From 88e765b9e6e9b45941fc9847dbf764ec4f1d84e0 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 14 Jun 2019 15:11:34 -0700 Subject: [PATCH 001/136] Fix: the binding tester was taking the min() of a list of tuples, but that could fail if the tuple contained incomparable types. Instead, use fdb.tuple.compare() to do the comparison. --- bindings/bindingtester/bindingtester.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index 5a60d1112a..559244233b 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -68,6 +68,10 @@ class ResultSet(object): self.tester_results[name] = results + @staticmethod + def _min_tuple(t1, t2): + return t1 if fdb.tuple.compare(t1, t2) < 0 else t2 + def check_for_errors(self): if len(self.tester_results) == 1: return (0, False) @@ -97,7 +101,7 @@ class ResultSet(object): # If these results aren't using sequence numbers, then we match two results based on whether they share the same key else: - min_key = min([r.key(self.specification) for r in results.values()]) + min_key = reduce(ResultSet._min_tuple, [r.key(self.specification) for r in results.values()]) results = {i: r for i, r in results.items() if Result.tuples_match(r.key(self.specification), min_key)} # Increment the indices for those testers which produced a result in this iteration From 7b12374a87d11ea103e9be89a13766685d5c8590 Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Tue, 18 Jun 2019 18:36:12 -0700 Subject: [PATCH 002/136] Fixes #1690: Server docker image hard-codes 4500 in a few places This makes the default public port for starting FDB processes the same as the FDB_PORT. This is probably necessary given #1714, especially for coordinators, though it might not be necessary for other processes in the cluster. 
This can *almost* be used to start up multiple FDB processes locally and then access them from the same machine, but that (unfortunately) requires both the other processes in the docker compose network and the host machine to agree on what IP to use for the coordinator. But as that machine has different IPs in those networks, they cannot be made to agree. --- packaging/docker/Dockerfile | 1 + packaging/docker/README.md | 9 +++- packaging/docker/create_cluster_file.bash | 4 +- .../docker/create_server_environment.bash | 2 +- packaging/docker/fdb.bash | 6 +-- packaging/docker/samples/local/README.md | 45 +++++++++++++++++++ .../docker/samples/local/docker-compose.yml | 32 +++++++++++++ packaging/docker/samples/local/start.bash | 39 ++++++++++++++++ packaging/docker/samples/local/stop.bash | 28 ++++++++++++ .../docker/samples/python/app/Dockerfile | 8 ++-- .../docker/samples/python/docker-compose.yml | 28 +++++++++--- 11 files changed, 185 insertions(+), 17 deletions(-) create mode 100644 packaging/docker/samples/local/README.md create mode 100644 packaging/docker/samples/local/docker-compose.yml create mode 100755 packaging/docker/samples/local/start.bash create mode 100755 packaging/docker/samples/local/stop.bash diff --git a/packaging/docker/Dockerfile b/packaging/docker/Dockerfile index 101ba295ab..dc514870f3 100644 --- a/packaging/docker/Dockerfile +++ b/packaging/docker/Dockerfile @@ -70,5 +70,6 @@ ENV FDB_PORT 4500 ENV FDB_CLUSTER_FILE /var/fdb/fdb.cluster ENV FDB_NETWORKING_MODE container ENV FDB_COORDINATOR "" +ENV FDB_COORDINATOR_PORT 4500 ENV FDB_CLUSTER_FILE_CONTENTS "" ENV FDB_PROCESS_CLASS unset diff --git a/packaging/docker/README.md b/packaging/docker/README.md index a8d6f48de8..39fc94844a 100644 --- a/packaging/docker/README.md +++ b/packaging/docker/README.md @@ -57,6 +57,13 @@ helpful when setting up a larger cluster inside a docker network, for instance when using Docker Compose. 
The name you provide must be resolvable through the DNS on the container you are running. +### FDB_COORDINATOR_PORT + +The port to use for connecting to the FDB coordinator process. This should be +set by other processes in a multi-process cluster to the same value as the +`FDB_PORT` environment variable of the coordinator process. It will default +to 4500, which is also the default for `FDB_PORT`. + # Copying Into Other Images You can also use this image to provide files for images that are clients of a @@ -68,4 +75,4 @@ files you may want to copy are: library, which you can use if you are setting up a multiversion client. * `/var/fdb/scripts/create_cluster_file.bash`: A script for setting up the cluster file based on an `FDB_COORDINATOR` environment variable. -* `/usr/bin/fdbcli`: The FoundationDB CLI. \ No newline at end of file +* `/usr/bin/fdbcli`: The FoundationDB CLI. diff --git a/packaging/docker/create_cluster_file.bash b/packaging/docker/create_cluster_file.bash index b701b03d1a..863ca43ac8 100644 --- a/packaging/docker/create_cluster_file.bash +++ b/packaging/docker/create_cluster_file.bash @@ -39,7 +39,7 @@ function create_cluster_file() { echo "Failed to look up coordinator address for $FDB_COORDINATOR" 1>&2 exit 1 fi - echo "docker:docker@$coordinator_ip:4500" > $FDB_CLUSTER_FILE + echo "docker:docker@$coordinator_ip:$FDB_COORDINATOR_PORT" > $FDB_CLUSTER_FILE else echo "FDB_COORDINATOR environment variable not defined" 1>&2 exit 1 @@ -48,4 +48,4 @@ function create_cluster_file() { if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then create_cluster_file "$@" -fi \ No newline at end of file +fi diff --git a/packaging/docker/create_server_environment.bash b/packaging/docker/create_server_environment.bash index 67979839b9..54d90f0854 100644 --- a/packaging/docker/create_server_environment.bash +++ b/packaging/docker/create_server_environment.bash @@ -43,4 +43,4 @@ function create_server_environment() { fi create_cluster_file -} \ No newline at end of file +} diff 
--git a/packaging/docker/fdb.bash b/packaging/docker/fdb.bash index 3fb322c431..3bf1c6a680 100644 --- a/packaging/docker/fdb.bash +++ b/packaging/docker/fdb.bash @@ -23,7 +23,7 @@ source /var/fdb/scripts/create_server_environment.bash create_server_environment source /var/fdb/.fdbenv -echo "Starting FDB server on $PUBLIC_IP:4500" -fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:4500 \ +echo "Starting FDB server on $PUBLIC_IP:$FDB_PORT" +fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:$FDB_PORT \ --datadir /var/fdb/data --logdir /var/fdb/logs \ - --locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS \ No newline at end of file + --locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS diff --git a/packaging/docker/samples/local/README.md b/packaging/docker/samples/local/README.md new file mode 100644 index 0000000000..f7f5b3e979 --- /dev/null +++ b/packaging/docker/samples/local/README.md @@ -0,0 +1,45 @@ +# Local Docker-based FoundationDB Cluster + +This contains a sample `docker-compose.yaml` and some simple startup and teardown +scripts for running a simple single-instance FoundationDB using the Docker image +specified in this repository. This uses the `host` networking option to expose +the server process to its host machine. + +This depends on having the FoundationDB client installed on your host machine +to work properly. This can be done using one of the client packages available +on our [Download](https://www.foundationdb.org/download/) page. The startup +scripts included here depend on `fdbcli` from one of those packages, and any +client that wishes to connect will need a copy of the FoundationDB native client +in addition to its binding of choice. 
Both the CLI and the native client
+are installed in all of our client packages.
+
+Once those dependencies are installed, one can build the FoundationDB Docker
+image:
+
+```
+docker build --build-arg FDB_VERSION=6.1.8 -t foundationdb:6.1.8 ../..
+```
+
+Then one can start the cluster by running:
+
+```
+./start.bash
+```
+
+This starts up a single instance FoundationDB cluster using the `docker-compose.yaml`
+and configures it as a new database. This will write the cluster file information to
+`docker.cluster`. One should then be able to access the cluster through the CLI
+or one of the bindings by using this cluster file. For example:
+
+```
+fdbcli --exec status -C docker.cluster
+```
+
+To stop the cluster, one can run:
+
+```
+./stop.bash
+```
+
+Note that all data are lost between reboots of the processes as they have not
+been configured to use a persistent volume (but write to Docker's temporary file system).
diff --git a/packaging/docker/samples/local/docker-compose.yml b/packaging/docker/samples/local/docker-compose.yml
new file mode 100644
index 0000000000..3ce177afb5
--- /dev/null
+++ b/packaging/docker/samples/local/docker-compose.yml
@@ -0,0 +1,32 @@
+# docker-compose.yaml
+#
+# This source file is part of the FoundationDB open source project
+#
+# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Specification for a one node cluster that can be accessed from the host.
+# The user must specify the FDB_PORT on which it is run. + +version: '3' +services: + fdb: + image: foundationdb:6.1.8 + ports: + - $FDB_PORT:$FDB_PORT/tcp + environment: + FDB_NETWORKING_MODE: host + FDB_COORDINATOR_PORT: $FDB_PORT + FDB_PORT: $FDB_PORT diff --git a/packaging/docker/samples/local/start.bash b/packaging/docker/samples/local/start.bash new file mode 100755 index 0000000000..64def42f51 --- /dev/null +++ b/packaging/docker/samples/local/start.bash @@ -0,0 +1,39 @@ +#! /bin/bash + +# +# start.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -eu + +FDB_CLUSTER_FILE="${FDB_CLUSTER_FILE:-docker.cluster}" +FDB_PORT="${FDB_PORT:-4550}" + +FDB_PORT=$FDB_PORT docker-compose up -d fdb +echo "docker:docker@127.0.0.1:$FDB_PORT" > $FDB_CLUSTER_FILE + +# Attempt to connect. Configure the database if necessary. +if ! fdbcli -C $FDB_CLUSTER_FILE --exec status --timeout 1 ; then + if ! fdbcli -C $FDB_CLUSTER_FILE --exec "configure new single memory ; status" --timeout 10 ; then + echo "Unable to configure new FDB cluster." + exit 1 + fi +fi + +echo "Can now connect to docker-based FDB cluster using $FDB_CLUSTER_FILE." 
diff --git a/packaging/docker/samples/local/stop.bash b/packaging/docker/samples/local/stop.bash new file mode 100755 index 0000000000..55acc50953 --- /dev/null +++ b/packaging/docker/samples/local/stop.bash @@ -0,0 +1,28 @@ +#! /bin/bash + +# +# stop.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -eu + +FDB_PORT="${FDB_PORT:-4550}" + +FDB_PORT=$FDB_PORT docker-compose down +echo "Docker-based FDB cluster is now down." 
diff --git a/packaging/docker/samples/python/app/Dockerfile b/packaging/docker/samples/python/app/Dockerfile index 8172f5aaea..7a3ed818a2 100644 --- a/packaging/docker/samples/python/app/Dockerfile +++ b/packaging/docker/samples/python/app/Dockerfile @@ -24,9 +24,9 @@ RUN apt-get update; apt-get install -y dnsutils RUN mkdir -p /app WORKDIR /app -COPY --from=foundationdb:5.2.5 /usr/lib/libfdb_c.so /usr/lib -COPY --from=foundationdb:5.2.5 /usr/bin/fdbcli /usr/bin/ -COPY --from=foundationdb:5.2.5 /var/fdb/scripts/create_cluster_file.bash /app +COPY --from=foundationdb:6.1.8 /usr/lib/libfdb_c.so /usr/lib +COPY --from=foundationdb:6.1.8 /usr/bin/fdbcli /usr/bin/ +COPY --from=foundationdb:6.1.8 /var/fdb/scripts/create_cluster_file.bash /app COPY requirements.txt /app RUN pip install -r requirements.txt @@ -38,4 +38,4 @@ RUN chmod u+x /app/start.bash CMD /app/start.bash ENV FLASK_APP=server.py -ENV FLASK_ENV=development \ No newline at end of file +ENV FLASK_ENV=development diff --git a/packaging/docker/samples/python/docker-compose.yml b/packaging/docker/samples/python/docker-compose.yml index 2280414688..34c62914a1 100644 --- a/packaging/docker/samples/python/docker-compose.yml +++ b/packaging/docker/samples/python/docker-compose.yml @@ -19,18 +19,34 @@ version: '3' services: - fdb: - image: foundationdb:5.2.5 - environment: - FDB_COORDINATOR: fdb-coordinator + # Specify three fdbserver processes. fdb-coordinator: - image: foundationdb:5.2.5 + image: foundationdb:6.1.8 environment: FDB_COORDINATOR: fdb-coordinator + fdb-server-1: + depends_on: + - fdb-coordinator + image: foundationdb:6.1.8 + environment: + FDB_COORDINATOR: fdb-coordinator + fdb-server-2: + depends_on: + - fdb-coordinator + image: foundationdb:6.1.8 + environment: + FDB_COORDINATOR: fdb-coordinator + + # Bring up the application so that it depends on the cluster. 
app: + depends_on: + - fdb-coordinator + - fdb-server-1 + - fdb-server-2 build: context: app ports: - 5000:5000 environment: - FDB_COORDINATOR: fdb-coordinator \ No newline at end of file + FDB_COORDINATOR: fdb-coordinator + FDB_COORDINATOR_PORT: 4550 From e0be6314145688e01c97720812c74dd9a823eb03 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 19 Jun 2019 18:15:09 -0700 Subject: [PATCH 003/136] shard the txs tag so that more transaction logs are involved in its recovery --- fdbclient/FDBTypes.h | 1 + fdbserver/LogSystem.h | 24 +++-- fdbserver/LogSystemDiskQueueAdapter.actor.cpp | 12 +-- fdbserver/LogSystemDiskQueueAdapter.h | 15 ++- fdbserver/LogSystemPeekCursor.actor.cpp | 17 ++-- fdbserver/MasterProxyServer.actor.cpp | 10 +- fdbserver/OldTLogServer_4_6.actor.cpp | 2 +- fdbserver/OldTLogServer_6_0.actor.cpp | 35 ++++--- fdbserver/TLogServer.actor.cpp | 49 ++++++---- fdbserver/TagPartitionedLogSystem.actor.cpp | 97 ++++++++++++++++--- fdbserver/WorkerInterface.actor.h | 3 +- fdbserver/masterserver.actor.cpp | 11 ++- 12 files changed, 193 insertions(+), 83 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index edb83f5f92..ee19f11961 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -43,6 +43,7 @@ enum { tagLocalityUpgraded = -4, tagLocalitySatellite = -5, tagLocalityLogRouterMapped = -6, + tagLocalityTxs = -7, tagLocalityInvalid = -99 }; //The TLog and LogRouter require these number to be as compact as possible diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index dad3779938..caa957f8d6 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -89,9 +89,9 @@ public: return result; } - void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags) { + void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) { satelliteTagLocations.clear(); - satelliteTagLocations.resize(std::max(logRouterTags,oldLogRouterTags) + 1); + 
satelliteTagLocations.resize(std::max({logRouterTags,oldLogRouterTags,txsTags,oldTxsTags})+1); std::map server_usedBest; std::set> used_servers; @@ -235,7 +235,7 @@ public: bool allLocations = false) { if(locality == tagLocalitySatellite) { for(auto& t : tags) { - if(t == txsTag || t.locality == tagLocalityLogRouter) { + if(t == txsTag || t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter) { for(int loc : satelliteTagLocations[t == txsTag ? 0 : t.id + 1]) { locations.push_back(locationOffset + loc); } @@ -520,8 +520,9 @@ struct ILogSystem { std::vector> cursors; std::vector epochEnds; Version poppedVersion; + bool needsPopped; - MultiCursor( std::vector> cursors, std::vector epochEnds ); + MultiCursor( std::vector> cursors, std::vector epochEnds, bool needsPopped = true ); virtual Reference cloneNoMore(); virtual void setProtocolVersion( ProtocolVersion version ); @@ -575,13 +576,14 @@ struct ILogSystem { LogMessageVersion messageVersion; Version end; bool hasNextMessage; + bool withTags; //FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade. bool collectTags; std::vector tags; void combineMessages(); - BufferedCursor( std::vector> cursors, Version begin, Version end, bool collectTags ); + BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags = false ); virtual Reference cloneNoMore(); virtual void setProtocolVersion( ProtocolVersion version ); @@ -652,13 +654,15 @@ struct ILogSystem { // Same contract as peek(), but can only peek from the logs elected in the same generation. // If the preferred log server is down, a different log from the same generation will merge results locally before sending them to the log router. 
- virtual Reference peekSpecial( UID dbgid, Version begin, Tag tag, int8_t peekLocality, Version localEnd ) = 0; - // Same contract as peek(), but it allows specifying a preferred peek locality for tags that do not have locality + virtual Reference peekTxs( UID dbgid, Version begin, int8_t peekLocality, Version localEnd ) = 0; + // Same contract as peek(), but only for peeking the txsLocality. It allows specifying a preferred peek locality. virtual Version getKnownCommittedVersion() = 0; virtual Future onKnownCommittedVersionChange() = 0; + virtual void popTxs( Version upTo, int8_t popLocality = tagLocalityInvalid ) = 0; + virtual void pop( Version upTo, Tag tag, Version knownCommittedVersion = 0, int8_t popLocality = tagLocalityInvalid ) = 0; // Permits, but does not require, the log subsystem to strip `tag` from any or all messages with message versions < (upTo,0) // The popping of any given message may be arbitrarily delayed. @@ -705,6 +709,8 @@ struct ILogSystem { virtual Tag getRandomRouterTag() = 0; + virtual Tag getRandomTxsTag() = 0; + virtual void stopRejoins() = 0; // Returns the pseudo tag to be popped for the given process class. If the @@ -752,6 +758,10 @@ struct LogPushData : NonCopyable { } } + void addTxsTag() { + next_message_tags.push_back( logSystem->getRandomTxsTag() ); + } + // addTag() adds a tag for the *next* message to be added void addTag( Tag tag ) { next_message_tags.push_back( tag ); diff --git a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp index 9fed1af178..b145b8db84 100644 --- a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp +++ b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp @@ -42,19 +42,19 @@ public: break; } when( wait( self->localityChanged ) ) { - self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? 
self->peekLocality->get().knownCommittedVersion : invalidVersion ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); self->localityChanged = self->peekLocality->onChange(); } when( wait( delay(self->peekTypeSwitches==0 ? SERVER_KNOBS->DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME : SERVER_KNOBS->DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME)) ) { self->peekTypeSwitches++; if(self->peekTypeSwitches%3==1) { - self->cursor = self->logSystem->peek( UID(), self->recoveryLoc, self->tag, true ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, tagLocalityInvalid, invalidVersion ); self->localityChanged = Never(); } else if(self->peekTypeSwitches%3==2) { - self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().secondaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().secondaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); self->localityChanged = self->peekLocality->onChange(); } else { - self->cursor = self->logSystem->peekSpecial( UID(), self->recoveryLoc, self->tag, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? self->peekLocality->get().knownCommittedVersion : invalidVersion ); + self->cursor = self->logSystem->peekTxs( UID(), self->recoveryLoc, self->peekLocality ? self->peekLocality->get().primaryLocality : tagLocalityInvalid, self->peekLocality ? 
self->peekLocality->get().knownCommittedVersion : invalidVersion ); self->localityChanged = self->peekLocality->onChange(); } } @@ -168,6 +168,6 @@ Future LogSystemDiskQueueAdapter::getC return pcm.getFuture(); } -LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Tag tag, Reference> peekLocality ) { - return new LogSystemDiskQueueAdapter( logSystem, tag, peekLocality ); +LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Reference> peekLocality ) { + return new LogSystemDiskQueueAdapter( logSystem, peekLocality ); } diff --git a/fdbserver/LogSystemDiskQueueAdapter.h b/fdbserver/LogSystemDiskQueueAdapter.h index c4ebc2ccbe..d652ba9a5b 100644 --- a/fdbserver/LogSystemDiskQueueAdapter.h +++ b/fdbserver/LogSystemDiskQueueAdapter.h @@ -25,16 +25,16 @@ #include "fdbclient/FDBTypes.h" #include "fdbserver/IDiskQueue.h" -struct PeekSpecialInfo { +struct PeekTxsInfo { int8_t primaryLocality; int8_t secondaryLocality; Version knownCommittedVersion; - bool operator == (const PeekSpecialInfo& r) const { + bool operator == (const PeekTxsInfo& r) const { return primaryLocality == r.primaryLocality && secondaryLocality == r.secondaryLocality && knownCommittedVersion == r.knownCommittedVersion; } - PeekSpecialInfo(int8_t primaryLocality, int8_t secondaryLocality, Version knownCommittedVersion) : primaryLocality(primaryLocality), secondaryLocality(secondaryLocality), knownCommittedVersion(knownCommittedVersion) {} + PeekTxsInfo(int8_t primaryLocality, int8_t secondaryLocality, Version knownCommittedVersion) : primaryLocality(primaryLocality), secondaryLocality(secondaryLocality), knownCommittedVersion(knownCommittedVersion) {} }; class LogSystemDiskQueueAdapter : public IDiskQueue { @@ -52,10 +52,10 @@ public: // It does, however, peek the specified tag directly at recovery time. 
- LogSystemDiskQueueAdapter( Reference logSystem, Tag tag, Reference> peekLocality, bool recover=true ) : logSystem(logSystem), tag(tag), peekLocality(peekLocality), enableRecovery(recover), recoveryLoc(1), recoveryQueueLoc(1), poppedUpTo(0), nextCommit(1), recoveryQueueDataSize(0), peekTypeSwitches(0) { + LogSystemDiskQueueAdapter( Reference logSystem, Reference> peekLocality, bool recover=true ) : logSystem(logSystem), peekLocality(peekLocality), enableRecovery(recover), recoveryLoc(1), recoveryQueueLoc(1), poppedUpTo(0), nextCommit(1), recoveryQueueDataSize(0), peekTypeSwitches(0) { if (enableRecovery) { localityChanged = peekLocality ? peekLocality->onChange() : Never(); - cursor = logSystem->peekSpecial( UID(), 1, tag, peekLocality ? peekLocality->get().primaryLocality : tagLocalityInvalid, peekLocality ? peekLocality->get().knownCommittedVersion : invalidVersion ); + cursor = logSystem->peekTxs( UID(), 1, peekLocality ? peekLocality->get().primaryLocality : tagLocalityInvalid, peekLocality ? peekLocality->get().knownCommittedVersion : invalidVersion ); } } @@ -92,11 +92,10 @@ public: virtual int getCommitOverhead() { return 0; } //SOMEDAY: could this be more accurate? 
private: - Reference> peekLocality; + Reference> peekLocality; Future localityChanged; Reference cursor; int peekTypeSwitches; - Tag tag; // Recovery state (used while readNext() is being called repeatedly) bool enableRecovery; @@ -114,6 +113,6 @@ private: friend class LogSystemDiskQueueAdapterImpl; }; -LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Tag tag, Reference> peekLocality ); +LogSystemDiskQueueAdapter* openDiskQueueAdapter( Reference logSystem, Reference> peekLocality ); #endif diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index ecf1877536..e58d365204 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -797,7 +797,7 @@ Version ILogSystem::SetPeekCursor::popped() { return poppedVersion; } -ILogSystem::MultiCursor::MultiCursor( std::vector> cursors, std::vector epochEnds ) : cursors(cursors), epochEnds(epochEnds), poppedVersion(0) { +ILogSystem::MultiCursor::MultiCursor( std::vector> cursors, std::vector epochEnds, bool needsPopped ) : cursors(cursors), epochEnds(epochEnds), needsPopped(needsPopped), poppedVersion(0) { for(int i = 0; i < std::min(cursors.size(),SERVER_KNOBS->MULTI_CURSOR_PRE_FETCH_LIMIT); i++) { cursors[cursors.size()-i-1]->getMore(); } @@ -841,7 +841,7 @@ const std::vector& ILogSystem::MultiCursor::getTags() { void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) { while( cursors.size() > 1 && n >= epochEnds.back() ) { - poppedVersion = std::max(poppedVersion, cursors.back()->popped()); + if(needsPopped) poppedVersion = std::max(poppedVersion, cursors.back()->popped()); cursors.pop_back(); epochEnds.pop_back(); } @@ -851,7 +851,7 @@ void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) { Future ILogSystem::MultiCursor::getMore(int taskID) { LogMessageVersion startVersion = cursors.back()->version(); while( cursors.size() > 1 && cursors.back()->version() >= epochEnds.back() ) { - poppedVersion = 
std::max(poppedVersion, cursors.back()->popped()); + if(needsPopped) poppedVersion = std::max(poppedVersion, cursors.back()->popped()); cursors.pop_back(); epochEnds.pop_back(); } @@ -882,10 +882,11 @@ Version ILogSystem::MultiCursor::getMinKnownCommittedVersion() { } Version ILogSystem::MultiCursor::popped() { + ASSERT(needsPopped); return std::max(poppedVersion, cursors.back()->popped()); } -ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool collectTags ) : cursors(cursors), messageVersion(begin), end(end), collectTags(collectTags), hasNextMessage(false), messageIndex(0) { +ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0) { messages.reserve(10000); } @@ -948,15 +949,17 @@ void ILogSystem::BufferedCursor::nextMessage() { } StringRef ILogSystem::BufferedCursor::getMessage() { - ASSERT(false); - return StringRef(); + ASSERT(!withTags); + return messages[messageIndex].message; } StringRef ILogSystem::BufferedCursor::getMessageWithTags() { + ASSERT(withTags); return messages[messageIndex].message; } const std::vector& ILogSystem::BufferedCursor::getTags() { + ASSERT(withTags); return messages[messageIndex].tags; } @@ -971,7 +974,7 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe return Void(); } while(cursor->hasMessage()) { - self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), self->collectTags ? cursor->getMessage() : cursor->getMessageWithTags(), cursor->getTags(), cursor->version())); + self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? 
std::vector() : cursor->getTags(), cursor->version())); cursor->nextMessage(); if(cursor->version().version >= maxVersion) { return Void(); diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 3fc4665a15..7b68e4d646 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -986,7 +986,7 @@ ACTOR Future commitBatch( bool firstMessage = true; for(auto m : msg.messages) { if(firstMessage) { - toCommit.addTag(txsTag); + toCommit.addTxsTag(); } toCommit.addMessage(StringRef(m.begin(), m.size()), !firstMessage); firstMessage = false; @@ -1033,7 +1033,7 @@ ACTOR Future commitBatch( self->txsPopVersions.emplace_back(commitVersion, msg.popTo); } - self->logSystem->pop(msg.popTo, txsTag); + self->logSystem->popTxs(msg.popTo); /////// Phase 5: Replies (CPU bound; no particular order required, though ordered execution would be best for latency) if ( prevVersion && commitVersion - prevVersion < SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT/2 ) @@ -1505,7 +1505,7 @@ ACTOR Future monitorRemoteCommitted(ProxyCommitData* self) { while(self->txsPopVersions.size() && self->txsPopVersions.front().first <= minVersion) { self->lastTxsPop = self->txsPopVersions.front().second; - self->logSystem->pop(self->txsPopVersions.front().second, txsTag, 0, tagLocalityRemoteLog); + self->logSystem->popTxs(self->txsPopVersions.front().second, tagLocalityRemoteLog); self->txsPopVersions.pop_front(); } @@ -1563,7 +1563,7 @@ ACTOR Future masterProxyServerCore( r->value().emplace_back(0,0); commitData.logSystem = ILogSystem::fromServerDBInfo(proxy.id(), commitData.db->get(), false, addActor); - commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, txsTag, Reference>(), false); + commitData.logAdapter = new LogSystemDiskQueueAdapter(commitData.logSystem, Reference>(), false); commitData.txnStateStore = keyValueStoreLogSystem(commitData.logAdapter, proxy.id(), 2e9, true, true, true); 
createWhitelistBinPathVec(whitelistBinPaths, commitData.whitelistedBinPathVec); @@ -1595,7 +1595,7 @@ ACTOR Future masterProxyServerCore( for(auto it : commitData.tag_popped) { commitData.logSystem->pop(it.second, it.first); } - commitData.logSystem->pop(commitData.lastTxsPop, txsTag, 0, tagLocalityRemoteLog); + commitData.logSystem->popTxs(commitData.lastTxsPop, tagLocalityRemoteLog); } Optional newLatencyBandConfig = commitData.db->get().latencyBandConfig; diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index fd2be1f08f..7be4599a7e 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -48,7 +48,7 @@ namespace oldTLog_4_6 { typedef int16_t OldTag; OldTag convertTag( Tag tag ) { - if(tag == invalidTag) return invalidTagOld; + if(tag == invalidTag || tag.locality == tagLocalityTxs) return invalidTagOld; if(tag == txsTag) return txsTagOld; ASSERT(tag.id >= 0); return tag.id; diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index fc9251ec78..6e3034821e 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -195,6 +195,7 @@ static const KeyRangeRef persistCurrentVersionKeys = KeyRangeRef( LiteralStringR static const KeyRangeRef persistKnownCommittedVersionKeys = KeyRangeRef( LiteralStringRef( "knownCommitted/" ), LiteralStringRef( "knownCommitted0" ) ); static const KeyRangeRef persistLocalityKeys = KeyRangeRef( LiteralStringRef( "Locality/" ), LiteralStringRef( "Locality0" ) ); static const KeyRangeRef persistLogRouterTagsKeys = KeyRangeRef( LiteralStringRef( "LogRouterTags/" ), LiteralStringRef( "LogRouterTags0" ) ); +static const KeyRangeRef persistTxsTagsKeys = KeyRangeRef( LiteralStringRef( "TxsTags/" ), LiteralStringRef( "TxsTags0" ) ); static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/")); static const KeyRange persistTagPoppedKeys = 
prefixRange(LiteralStringRef("TagPop/")); @@ -333,7 +334,7 @@ struct LogData : NonCopyable, public ReferenceCounted { auto const& m = self->versionMessages.front(); ++messagesErased; - if(self->tag != txsTag) { + if(self->tag.locality != tagLocalityTxs && self->tag != txsTag) { sizes.first -= m.second.expectedSize(); } else { sizes.second -= m.second.expectedSize(); @@ -433,9 +434,10 @@ struct LogData : NonCopyable, public ReferenceCounted { Future terminated; FlowLock execOpLock; bool execOpCommitInProgress; + int txsTags; - explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, UID recruitmentID, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), - cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), recruitmentID(recruitmentID), + explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, int txsTags, UID recruitmentID, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), + cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), txsTags(txsTags), recruitmentID(recruitmentID), logSystem(new AsyncVar>()), logRouterPoppedVersion(0), durableKnownCommittedVersion(0), minKnownCommittedVersion(0), allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), // These are initialized differently on init() or recovery recoveryCount(), stopped(false), initialized(false), queueCommittingVersion(0), newPersistentDataVersion(invalidVersion), unrecoveredBefore(1), recoveredAt(1), unpoppedRecoveredTags(0), @@ -482,6 +484,7 @@ struct LogData : NonCopyable, public ReferenceCounted { tLogData->persistentData->clear( 
singleKeyRange(logIdKey.withPrefix(persistKnownCommittedVersionKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLocalityKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLogRouterTagsKeys.begin)) ); + tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistTxsTagsKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryCountKeys.begin)) ); Key msgKey = logIdKey.withPrefix(persistTagMessagesKeys.begin); tLogData->persistentData->clear( KeyRangeRef( msgKey, strinc(msgKey) ) ); @@ -814,7 +817,7 @@ void commitMessages( TLogData* self, Reference logData, Version version block.append(block.arena(), msg.message.begin(), msg.message.size()); for(auto tag : msg.tags) { if(logData->locality == tagLocalitySatellite) { - if(!(tag == txsTag || tag.locality == tagLocalityLogRouter)) { + if(!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) { continue; } } else if(!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality || tag.locality < 0)) { @@ -827,6 +830,9 @@ void commitMessages( TLogData* self, Reference logData, Version version } tag.id = tag.id % logData->logRouterTags; } + if(tag.locality == tagLocalityTxs) { + tag.id = tag.id % logData->txsTags; + } Reference tagData = logData->getTagData(tag); if(!tagData) { tagData = logData->createTagData(tag, 0, true, true, false); @@ -837,7 +843,7 @@ void commitMessages( TLogData* self, Reference logData, Version version if(tagData->versionMessages.back().second.expectedSize() > SERVER_KNOBS->MAX_MESSAGE_SIZE) { TraceEvent(SevWarnAlways, "LargeMessage").detail("Size", tagData->versionMessages.back().second.expectedSize()); } - if (tag != txsTag) { + if (tag.locality != tagLocalityTxs && tag != txsTag) { expectedBytes += tagData->versionMessages.back().second.expectedSize(); } else { txsBytes += 
tagData->versionMessages.back().second.expectedSize(); @@ -905,7 +911,7 @@ std::deque> & getVersionMessages( Re }; ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if (self->ignorePopRequest && inputTag != txsTag) { + if (self->ignorePopRequest && inputTag.locality != tagLocalityTxs && inputTag != txsTag) { TraceEvent("IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); if (self->toBePopped.find(inputTag) == self->toBePopped.end() @@ -1062,7 +1068,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere wait( delay(0.0, TaskLowPriority) ); } - if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { + if( req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. 
@@ -1303,7 +1309,7 @@ void execProcessingHelper(TLogData* self, rd >> messageLength >> sub >> tagCount; for (int i = 0; i < tagCount; i++) { rd >> tmpTag; - if (tmpTag == txsTag) { + if (tmpTag.locality == tagLocalityTxs || tmpTag == txsTag) { hasTxsTag = true; } execTags->push_back(execTags->arena(), tmpTag); @@ -1632,6 +1638,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin), BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLocalityKeys.begin), BinaryWriter::toValue(logData->locality, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLogRouterTagsKeys.begin), BinaryWriter::toValue(logData->logRouterTags, Unversioned()) ) ); + storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistTxsTagsKeys.begin), BinaryWriter::toValue(logData->txsTags, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistRecoveryCountKeys.begin), BinaryWriter::toValue(logData->recoveryCount, Unversioned()) ) ); for(auto tag : logData->allTags) { @@ -2039,12 +2046,13 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality state Future>> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys); state Future>> fLocality = storage->readRange(persistLocalityKeys); state Future>> fLogRouterTags = storage->readRange(persistLogRouterTagsKeys); + state Future>> fTxsTags = storage->readRange(persistTxsTagsKeys); state Future>> fRecoverCounts = storage->readRange(persistRecoveryCountKeys); // FIXME: metadata in queue? 
wait( waitForAll( (vector>>(), fFormat ) ) ); - wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fRecoverCounts) ) ); + wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fTxsTags, fRecoverCounts) ) ); if (fFormat.get().present() && !persistFormatReadableRange.contains( fFormat.get().get() )) { //FIXME: remove when we no longer need to test upgrades from 4.X releases @@ -2096,6 +2104,11 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality id_logRouterTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistLogRouterTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); } + state std::map id_txsTags; + for(auto it : fTxsTags.get()) { + id_txsTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistTxsTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); + } + state std::map id_knownCommitted; for(auto it : fKnownCommitted.get()) { id_knownCommitted[ BinaryReader::fromStringRef(it.key.removePrefix(persistKnownCommittedVersionKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); @@ -2121,7 +2134,7 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality DUMPTOKEN( recruited.confirmRunning ); //We do not need the remoteTag, because we will not be loading any additional data - logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], UID(), std::vector()) ); + logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], id_txsTags[id1], UID(), std::vector()) ); logData->locality = id_locality[id1]; logData->stopped = true; self->id_data[id1] = logData; @@ -2304,7 +2317,7 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit it.second->stopCommit.trigger(); } - state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, 
req.recruitmentID, req.allTags) ); + state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.txsTags, req.recruitmentID, req.allTags) ); self->id_data[recruited.id()] = logData; logData->locality = req.locality; logData->recoveryCount = req.epoch; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 52d0079ab7..e34052a0a4 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -205,6 +205,7 @@ static const KeyRangeRef persistKnownCommittedVersionKeys = KeyRangeRef( Literal static const KeyRef persistRecoveryLocationKey = KeyRef( LiteralStringRef( "recoveryLocation" ) ); static const KeyRangeRef persistLocalityKeys = KeyRangeRef( LiteralStringRef( "Locality/" ), LiteralStringRef( "Locality0" ) ); static const KeyRangeRef persistLogRouterTagsKeys = KeyRangeRef( LiteralStringRef( "LogRouterTags/" ), LiteralStringRef( "LogRouterTags0" ) ); +static const KeyRangeRef persistTxsTagsKeys = KeyRangeRef( LiteralStringRef( "TxsTags/" ), LiteralStringRef( "TxsTags0" ) ); static const KeyRange persistTagMessagesKeys = prefixRange(LiteralStringRef("TagMsg/")); static const KeyRange persistTagMessageRefsKeys = prefixRange(LiteralStringRef("TagMsgRef/")); static const KeyRange persistTagPoppedKeys = prefixRange(LiteralStringRef("TagPop/")); @@ -389,7 +390,7 @@ struct LogData : NonCopyable, public ReferenceCounted { auto const& m = self->versionMessages.front(); ++messagesErased; - if(self->tag != txsTag) { + if(self->tag.locality != tagLocalityTxs && self->tag != txsTag) { sizes.first -= m.second.expectedSize(); } else { sizes.second -= m.second.expectedSize(); @@ -491,9 +492,10 @@ struct LogData : NonCopyable, public ReferenceCounted { Future terminated; FlowLock execOpLock; bool execOpCommitInProgress; + int txsTags; - explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, UID recruitmentID, ProtocolVersion 
protocolVersion, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), - cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), recruitmentID(recruitmentID), protocolVersion(protocolVersion), + explicit LogData(TLogData* tLogData, TLogInterface interf, Tag remoteTag, bool isPrimary, int logRouterTags, int txsTags, UID recruitmentID, ProtocolVersion protocolVersion, std::vector tags) : tLogData(tLogData), knownCommittedVersion(0), logId(interf.id()), + cc("TLog", interf.id().toString()), bytesInput("BytesInput", cc), bytesDurable("BytesDurable", cc), remoteTag(remoteTag), isPrimary(isPrimary), logRouterTags(logRouterTags), txsTags(txsTags), recruitmentID(recruitmentID), protocolVersion(protocolVersion), logSystem(new AsyncVar>()), logRouterPoppedVersion(0), durableKnownCommittedVersion(0), minKnownCommittedVersion(0), queuePoppedVersion(0), allTags(tags.begin(), tags.end()), terminated(tLogData->terminated.getFuture()), // These are initialized differently on init() or recovery recoveryCount(), stopped(false), initialized(false), queueCommittingVersion(0), newPersistentDataVersion(invalidVersion), unrecoveredBefore(1), recoveredAt(1), unpoppedRecoveredTags(0), @@ -542,6 +544,7 @@ struct LogData : NonCopyable, public ReferenceCounted { tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistKnownCommittedVersionKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLocalityKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistLogRouterTagsKeys.begin)) ); + tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistTxsTagsKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryCountKeys.begin)) ); tLogData->persistentData->clear( 
singleKeyRange(logIdKey.withPrefix(persistProtocolVersionKeys.begin)) ); tLogData->persistentData->clear( singleKeyRange(logIdKey.withPrefix(persistRecoveryLocationKey)) ); @@ -637,7 +640,7 @@ void updatePersistentPopped( TLogData* self, Reference logData, Referen if (data->nothingPersistent) return; - if (data->tag == txsTag) { + if (data->tag.locality == tagLocalityTxs || data->tag == txsTag) { self->persistentData->clear( KeyRangeRef( persistTagMessagesKey( logData->logId, data->tag, Version(0) ), persistTagMessagesKey( logData->logId, data->tag, data->popped ) ) ); @@ -654,7 +657,7 @@ void updatePersistentPopped( TLogData* self, Reference logData, Referen ACTOR Future updatePoppedLocation( TLogData* self, Reference logData, Reference data ) { // txsTag is spilled by value, so we do not need to track its popped location. - if (data->tag == txsTag) { + if (data->tag.locality == tagLocalityTxs || data->tag == txsTag) { return Void(); } @@ -724,7 +727,7 @@ ACTOR Future popDiskQueue( TLogData* self, Reference logData ) { for(int tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(int tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { Reference tagData = logData->tag_data[tagLocality][tagId]; - if (tagData && tagData->tag != txsTag && !tagData->nothingPersistent) { + if (tagData && tagData->tag.locality != tagLocalityTxs && tagData->tag != txsTag && !tagData->nothingPersistent) { minLocation = std::min(minLocation, tagData->poppedLocation); minVersion = std::min(minVersion, tagData->popped); } @@ -783,7 +786,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD anyData = true; tagData->nothingPersistent = false; - if (tagData->tag == txsTag) { + if (tagData->tag.locality == tagLocalityTxs || tagData->tag == txsTag) { // spill txsTag by value wr = BinaryWriter( Unversioned() ); for(; msg != tagData->versionMessages.end() && msg->first == currentVersion; ++msg) { @@ -889,7 +892,7 @@ ACTOR Future 
updatePersistentData( TLogData* self, Reference logD for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { Reference tagData = logData->tag_data[tagLocality][tagId]; if (tagData) { - if (tagData->tag == txsTag) { + if (tagData->tag.locality == tagLocalityTxs || tagData->tag == txsTag) { minVersion = std::min(minVersion, newPersistentDataVersion); } else { minVersion = std::min(minVersion, tagData->popped); @@ -1064,7 +1067,7 @@ void commitMessages( TLogData* self, Reference logData, Version version block.append(block.arena(), msg.message.begin(), msg.message.size()); for(auto tag : msg.tags) { if(logData->locality == tagLocalitySatellite) { - if(!(tag == txsTag || tag.locality == tagLocalityLogRouter)) { + if(!(tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || tag == txsTag)) { continue; } } else if(!(logData->locality == tagLocalitySpecial || logData->locality == tag.locality || tag.locality < 0)) { @@ -1077,6 +1080,9 @@ void commitMessages( TLogData* self, Reference logData, Version version } tag.id = tag.id % logData->logRouterTags; } + if(tag.locality == tagLocalityTxs) { + tag.id = tag.id % logData->txsTags; + } Reference tagData = logData->getTagData(tag); if(!tagData) { tagData = logData->createTagData(tag, 0, true, true, false); @@ -1087,7 +1093,7 @@ void commitMessages( TLogData* self, Reference logData, Version version if(tagData->versionMessages.back().second.expectedSize() > SERVER_KNOBS->MAX_MESSAGE_SIZE) { TraceEvent(SevWarnAlways, "LargeMessage").detail("Size", tagData->versionMessages.back().second.expectedSize()); } - if (tag != txsTag) { + if (tag.locality != tagLocalityTxs && tag != txsTag) { expectedBytes += tagData->versionMessages.back().second.expectedSize(); } else { txsBytes += tagData->versionMessages.back().second.expectedSize(); @@ -1155,7 +1161,7 @@ std::deque> & getVersionMessages( Re }; ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if 
(self->ignorePopRequest && inputTag != txsTag) { + if (self->ignorePopRequest && inputTag.locality != tagLocalityTxs && inputTag != txsTag) { TraceEvent("IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); if (self->toBePopped.find(inputTag) == self->toBePopped.end() @@ -1296,7 +1302,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere state BinaryWriter messages2(Unversioned()); state int sequence = -1; state UID peekId; - + if(req.sequence.present()) { try { peekId = req.sequence.get().first; @@ -1349,7 +1355,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere wait( delay(0.0, TaskLowPriority) ); } - if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { + if( req.begin <= logData->persistentDataDurableVersion && req.tag.locality != tagLocalityTxs && req.tag != txsTag) { // Reading spilled data will almost always imply that the storage server is >5s behind the rest // of the cluster. We shouldn't prioritize spending CPU on helping this server catch up // slightly faster over keeping the rest of the cluster operating normally. 
@@ -1402,7 +1408,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere peekMessagesFromMemory( logData, req, messages2, endVersion ); - if (req.tag == txsTag) { + if (req.tag.locality == tagLocalityTxs || req.tag == txsTag) { Standalone> kvs = wait( self->persistentData->readRange(KeyRangeRef( persistTagMessagesKey(logData->logId, req.tag, req.begin), @@ -1670,7 +1676,7 @@ void execProcessingHelper(TLogData* self, rd >> messageLength >> sub >> tagCount; for (int i = 0; i < tagCount; i++) { rd >> tmpTag; - if (tmpTag == txsTag) { + if (tmpTag.locality == tagLocalityTxs || tmpTag == txsTag) { hasTxsTag = true; } execTags->push_back(execTags->arena(), tmpTag); @@ -2001,6 +2007,7 @@ ACTOR Future initPersistentState( TLogData* self, Reference logDa storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistKnownCommittedVersionKeys.begin), BinaryWriter::toValue(logData->knownCommittedVersion, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLocalityKeys.begin), BinaryWriter::toValue(logData->locality, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistLogRouterTagsKeys.begin), BinaryWriter::toValue(logData->logRouterTags, Unversioned()) ) ); + storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistTxsTagsKeys.begin), BinaryWriter::toValue(logData->txsTags, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistRecoveryCountKeys.begin), BinaryWriter::toValue(logData->recoveryCount, Unversioned()) ) ); storage->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistProtocolVersionKeys.begin), BinaryWriter::toValue(logData->protocolVersion, Unversioned()) ) ); @@ -2417,13 +2424,14 @@ ACTOR Future restorePersistentState( TLogData* self, 
LocalityData locality state Future>> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys); state Future>> fLocality = storage->readRange(persistLocalityKeys); state Future>> fLogRouterTags = storage->readRange(persistLogRouterTagsKeys); + state Future>> fTxsTags = storage->readRange(persistTxsTagsKeys); state Future>> fRecoverCounts = storage->readRange(persistRecoveryCountKeys); state Future>> fProtocolVersions = storage->readRange(persistProtocolVersionKeys); // FIXME: metadata in queue? wait( waitForAll( (vector>>(), fFormat, fRecoveryLocation ) ) ); - wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fRecoverCounts, fProtocolVersions ) ) ); + wait( waitForAll( (vector>>>(), fVers, fKnownCommitted, fLocality, fLogRouterTags, fTxsTags, fRecoverCounts, fProtocolVersions ) ) ); if (fFormat.get().present() && !persistFormatReadableRange.contains( fFormat.get().get() )) { //FIXME: remove when we no longer need to test upgrades from 4.X releases @@ -2465,6 +2473,11 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality id_logRouterTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistLogRouterTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); } + state std::map id_txsTags; + for(auto it : fTxsTags.get()) { + id_txsTags[ BinaryReader::fromStringRef(it.key.removePrefix(persistTxsTagsKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); + } + state std::map id_knownCommitted; for(auto it : fKnownCommitted.get()) { id_knownCommitted[ BinaryReader::fromStringRef(it.key.removePrefix(persistKnownCommittedVersionKeys.begin), Unversioned())] = BinaryReader::fromStringRef( it.value, Unversioned() ); @@ -2498,7 +2511,7 @@ ACTOR Future restorePersistentState( TLogData* self, LocalityData locality ProtocolVersion protocolVersion = BinaryReader::fromStringRef( fProtocolVersions.get()[idx].value, Unversioned() ); //We do not need 
the remoteTag, because we will not be loading any additional data - logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], UID(), protocolVersion, std::vector()) ); + logData = Reference( new LogData(self, recruited, Tag(), true, id_logRouterTags[id1], id_txsTags[id1], UID(), protocolVersion, std::vector()) ); logData->locality = id_locality[id1]; logData->stopped = true; self->id_data[id1] = logData; @@ -2700,7 +2713,7 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit it.second->stopCommit.trigger(); } - state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.recruitmentID, currentProtocolVersion, req.allTags) ); + state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.txsTags, req.recruitmentID, currentProtocolVersion, req.allTags) ); self->id_data[recruited.id()] = logData; logData->locality = req.locality; logData->recoveryCount = req.epoch; diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2e25daae3b..eafb2e9554 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -454,7 +454,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { lastBegin = std::max(lastBegin, log->startVersion); localSets.push_back(log); if(log->locality != tagLocalitySatellite) { @@ -481,7 +481,7 @@ struct 
TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { thisBegin = std::max(thisBegin, log->startVersion); localOldSets.push_back(log); if(log->locality != tagLocalitySatellite) { @@ -624,7 +624,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end.present() ? end.get() + 1 : getPeekEnd(), tLogs[0]->locality == tagLocalityUpgraded) ); + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end.present() ? end.get() + 1 : getPeekEnd(), true, tLogs[0]->locality == tagLocalityUpgraded) ); } Reference peekLocal( UID dbgid, Tag tag, Version begin, Version end, bool useMergePeekCursors, int8_t peekLocality = tagLocalityInvalid ) { @@ -682,7 +682,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted peekSpecial( UID dbgid, Version begin, Tag tag, int8_t peekLocality, Version localEnd ) { + virtual Reference peekTxs( UID dbgid, Version begin, int8_t peekLocality, Version localEnd ) { Version end = getEnd(); - TraceEvent("TLogPeekSpecial", dbgid).detail("Begin", begin).detail("End", end).detail("LocalEnd", localEnd).detail("PeekLocality", peekLocality); + if(!tLogs.size()) { + TraceEvent("TLogPeekTxsNoLogs", dbgid); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), txsTag, begin, end, false, false ) ); + } + TraceEvent("TLogPeekTxs", dbgid).detail("Begin", begin).detail("End", end).detail("LocalEnd", localEnd).detail("PeekLocality", peekLocality); + if(peekLocality < 0 || localEnd == invalidVersion || localEnd 
<= begin) { - return peekAll(dbgid, begin, end, tag, true); + std::vector< Reference > cursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); + + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } try { if(localEnd >= end) { - return peekLocal(dbgid, tag, begin, end, true, peekLocality); + std::vector< Reference > cursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + cursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, end, true, peekLocality)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + cursors.push_back(peekLocal(dbgid, txsTag, begin, end, true, peekLocality)); + + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } std::vector< Reference > cursors; std::vector< LogMessageVersion > epochEnds; cursors.resize(2); - cursors[1] = peekLocal(dbgid, tag, begin, localEnd, true, peekLocality); - cursors[0] = peekAll(dbgid, localEnd, end, tag, true); + + std::vector< Reference > localCursors; + std::vector< Reference > allCursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + localCursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, localEnd, true, peekLocality)); + allCursors.push_back(peekAll(dbgid, localEnd, end, Tag(tagLocalityTxs, i), true)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + localCursors.push_back(peekLocal(dbgid, txsTag, begin, localEnd, true, peekLocality)); + allCursors.push_back(peekAll(dbgid, localEnd, end, txsTag, true)); + + cursors[1] = Reference( new ILogSystem::BufferedCursor(localCursors, begin, localEnd, false) ); + cursors[0] = Reference( new ILogSystem::BufferedCursor(allCursors, localEnd, end, false) ); epochEnds.emplace_back(localEnd); - return 
Reference( new ILogSystem::MultiCursor(cursors, epochEnds) ); + return Reference( new ILogSystem::MultiCursor(cursors, epochEnds, false) ); } catch( Error& e ) { if(e.code() == error_code_worker_removed) { - return peekAll(dbgid, begin, end, tag, true); + std::vector< Reference > cursors; + for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + cursors.push_back(peekAll(dbgid, begin, end, txsTag, true)); + + return Reference( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } throw; } @@ -909,6 +946,16 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); i++) { + pop(upTo, Tag(tagLocalityTxs, i), 0, popLocality); + } + } + //SOMEDAY: remove once upgrades from 6.2 are no longer supported + pop(upTo, txsTag, 0, popLocality); + } + virtual void pop( Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality ) { if (upTo <= 0) return; if( tag.locality == tagLocalityRemoteLog) { @@ -1126,6 +1173,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedrandomInt(0, logRouterTags)); } + virtual Tag getRandomTxsTag() { + ASSERT(tLogs.size()); + return Tag(tagLocalityTxs, deterministicRandom()->randomInt(0, tLogs[0]->logServers.size())); + } + ACTOR static Future monitorLog(Reference>> logServer, Reference> failed) { state Future waitFailure; loop { @@ -1730,6 +1782,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedstartVersion; req.logRouterTags = 0; + req.txsTags = self->tLogs[0]->logServers.size(); } logSet->tLogLocalities.resize( remoteWorkers.remoteTLogs.size() ); @@ -1823,7 +1876,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->logServers.resize( recr.satelliteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size 
logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); - logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags); + logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),oldLogSystem->tLogs.size() ? oldLogSystem->tLogs[0]->logServers.size() : 0); logSystem->expectedLogSets++; } @@ -1903,6 +1956,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->startVersion; req.logRouterTags = logSystem->logRouterTags; + req.txsTags = recr.tLogs.size(); } logSystem->tLogs[0]->tLogLocalities.resize( recr.tLogs.size() ); @@ -1927,7 +1981,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> recoveryComplete; if(region.satelliteTLogReplicationFactor > 0) { - std::vector satelliteTags(1, txsTag); + std::vector satelliteTags; + for(int i = 0; i < recr.tLogs.size(); i++) { + satelliteTags.push_back(Tag(tagLocalityTxs, i)); + } + satelliteTags.push_back(txsTag); state vector> satelliteInitializationReplies; vector< InitializeTLogRequest > sreqs( recr.satelliteTLogs.size() ); @@ -1947,6 +2005,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedknownCommittedVersion + 1; req.logRouterTags = logSystem->logRouterTags; + req.txsTags = recr.tLogs.size(); } for(int i = -1; i < oldLogSystem->logRouterTags; i++) { @@ -1957,6 +2016,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } + for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) satelliteInitializationReplies.push_back( transformErrors( throwErrorOr( recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); diff --git a/fdbserver/WorkerInterface.actor.h 
b/fdbserver/WorkerInterface.actor.h index 8370e7fdde..d739f8770c 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -108,6 +108,7 @@ struct InitializeTLogRequest { bool isPrimary; Version startVersion; int logRouterTags; + int txsTags; ReplyPromise< struct TLogInterface > reply; @@ -115,7 +116,7 @@ struct InitializeTLogRequest { template void serialize( Ar& ar ) { - serializer(ar, recruitmentID, recoverFrom, recoverAt, knownCommittedVersion, epoch, recoverTags, allTags, storeType, remoteTag, locality, isPrimary, startVersion, logRouterTags, reply, logVersion, spillType); + serializer(ar, recruitmentID, recoverFrom, recoverAt, knownCommittedVersion, epoch, recoverTags, allTags, storeType, remoteTag, locality, isPrimary, startVersion, logRouterTags, reply, logVersion, spillType, txsTags); } }; diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 205b1dbc19..55de67106a 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -605,21 +605,21 @@ ACTOR Future>> recruitEverything( Refere return confChanges; } -ACTOR Future updateLocalityForDcId(Optional dcId, Reference oldLogSystem, Reference> locality) { +ACTOR Future updateLocalityForDcId(Optional dcId, Reference oldLogSystem, Reference> locality) { loop { std::pair loc = oldLogSystem->getLogSystemConfig().getLocalityForDcId(dcId); Version ver = locality->get().knownCommittedVersion; if(ver == invalidVersion) { ver = oldLogSystem->getKnownCommittedVersion(); } - locality->set( PeekSpecialInfo(loc.first,loc.second,ver) ); + locality->set( PeekTxsInfo(loc.first,loc.second,ver) ); TraceEvent("UpdatedLocalityForDcId").detail("DcId", dcId).detail("Locality0", loc.first).detail("Locality1", loc.second).detail("Version", ver); wait( oldLogSystem->onLogSystemConfigChange() || oldLogSystem->onKnownCommittedVersionChange() ); } } ACTOR Future readTransactionSystemState( Reference self, Reference oldLogSystem ) { - state 
Reference> myLocality = Reference>( new AsyncVar(PeekSpecialInfo(tagLocalityInvalid,tagLocalityInvalid,invalidVersion) ) ); + state Reference> myLocality = Reference>( new AsyncVar(PeekTxsInfo(tagLocalityInvalid,tagLocalityInvalid,invalidVersion) ) ); state Future localityUpdater = updateLocalityForDcId(self->myInterface.locality.dcId(), oldLogSystem, myLocality); // Peek the txnStateTag in oldLogSystem and recover self->txnStateStore @@ -630,7 +630,7 @@ ACTOR Future readTransactionSystemState( Reference self, Refer // Recover transaction state store if(self->txnStateStore) self->txnStateStore->close(); - self->txnStateLogAdapter = openDiskQueueAdapter( oldLogSystem, txsTag, myLocality ); + self->txnStateLogAdapter = openDiskQueueAdapter( oldLogSystem, myLocality ); self->txnStateStore = keyValueStoreLogSystem( self->txnStateLogAdapter, self->dbgid, self->memoryLimit, false, false, true ); // Versionstamped operations (particularly those applied from DR) define a minimum commit version @@ -676,6 +676,9 @@ ACTOR Future readTransactionSystemState( Reference self, Refer Standalone> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) ); self->allTags.clear(); if(self->lastEpochEnd > 0) { + for(int i = 0; i < oldLogSystem->getLogSystemConfig().tLogs[0].tLogs.size(); i++) { + self->allTags.push_back(Tag(tagLocalityTxs, i)); + } self->allTags.push_back(txsTag); } From c92324b8948cfef41a94acd59c744826e5959f1c Mon Sep 17 00:00:00 2001 From: Alec Grieser Date: Thu, 20 Jun 2019 08:54:12 -0700 Subject: [PATCH 004/136] python sample docker app uses default coordinator port --- packaging/docker/create_cluster_file.bash | 5 +++-- packaging/docker/samples/python/docker-compose.yml | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packaging/docker/create_cluster_file.bash b/packaging/docker/create_cluster_file.bash index 863ca43ac8..c1bb959b8e 100644 --- a/packaging/docker/create_cluster_file.bash +++ b/packaging/docker/create_cluster_file.bash @@ 
-39,7 +39,8 @@ function create_cluster_file() { echo "Failed to look up coordinator address for $FDB_COORDINATOR" 1>&2 exit 1 fi - echo "docker:docker@$coordinator_ip:$FDB_COORDINATOR_PORT" > $FDB_CLUSTER_FILE + coordinator_port=${FDB_COORDINATOR_PORT:-4500} + echo "docker:docker@$coordinator_ip:$coordinator_port" > $FDB_CLUSTER_FILE else echo "FDB_COORDINATOR environment variable not defined" 1>&2 exit 1 @@ -47,5 +48,5 @@ function create_cluster_file() { } if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - create_cluster_file "$@" + create_cluster_file "$@" fi diff --git a/packaging/docker/samples/python/docker-compose.yml b/packaging/docker/samples/python/docker-compose.yml index 34c62914a1..e239bff80f 100644 --- a/packaging/docker/samples/python/docker-compose.yml +++ b/packaging/docker/samples/python/docker-compose.yml @@ -46,7 +46,6 @@ services: build: context: app ports: - - 5000:5000 + - 5000:5000/tcp environment: FDB_COORDINATOR: fdb-coordinator - FDB_COORDINATOR_PORT: 4550 From 76ba4e60b70ee9f7993869a311fdc0c1c591dd65 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 24 Jun 2019 13:03:35 -0700 Subject: [PATCH 005/136] fixed a stack overflow bug --- fdbserver/LogSystemDiskQueueAdapter.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp index b145b8db84..af2323923a 100644 --- a/fdbserver/LogSystemDiskQueueAdapter.actor.cpp +++ b/fdbserver/LogSystemDiskQueueAdapter.actor.cpp @@ -66,6 +66,7 @@ public: } if(!self->cursor->hasMessage()) { self->recoveryLoc = self->cursor->version().version; + wait(delay(0)); continue; } } From 7a500cd37f10c135321e27eac5b541a4e504efd1 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 02:47:35 -0700 Subject: [PATCH 006/136] A giant translation of TaskFooPriority -> TaskPriority::Foo This is so that APIs that take priorities don't take ints, which are common and easy to accidentally pass the wrong thing. 
--- bindings/flow/fdb_flow.actor.cpp | 4 +- fdbclient/BackupAgentBase.actor.cpp | 10 +- fdbclient/ClusterInterface.h | 12 +- fdbclient/DatabaseContext.h | 6 +- fdbclient/FailureMonitorClient.actor.cpp | 14 +-- fdbclient/HTTP.actor.cpp | 4 +- fdbclient/ManagementAPI.actor.cpp | 4 +- fdbclient/MasterProxyInterface.h | 8 +- fdbclient/MonitorLeader.actor.cpp | 4 +- fdbclient/NativeAPI.actor.cpp | 48 ++++---- fdbclient/NativeAPI.actor.h | 6 +- fdbclient/StatusClient.actor.cpp | 2 +- fdbclient/StorageServerInterface.h | 6 +- fdbclient/VersionedMap.actor.h | 2 +- fdbclient/VersionedMap.h | 2 +- fdbrpc/AsyncFileEIO.actor.h | 16 +-- fdbrpc/AsyncFileKAIO.actor.h | 6 +- fdbrpc/AsyncFileNonDurable.actor.cpp | 4 +- fdbrpc/AsyncFileNonDurable.actor.h | 22 ++-- fdbrpc/FlowTests.actor.cpp | 12 +- fdbrpc/FlowTransport.actor.cpp | 42 +++---- fdbrpc/FlowTransport.h | 4 +- fdbrpc/LoadBalance.actor.h | 2 +- fdbrpc/batcher.actor.h | 2 +- fdbrpc/fdbrpc.h | 28 ++--- fdbrpc/genericactors.actor.h | 2 +- fdbrpc/sim2.actor.cpp | 48 ++++---- fdbrpc/simulator.h | 4 +- fdbserver/ClusterController.actor.cpp | 4 +- fdbserver/ClusterRecruitmentInterface.h | 14 +-- fdbserver/Coordination.actor.cpp | 10 +- fdbserver/CoroFlow.actor.cpp | 2 +- fdbserver/DataDistribution.actor.cpp | 40 +++---- fdbserver/DataDistributionQueue.actor.cpp | 22 ++-- fdbserver/DataDistributionTracker.actor.cpp | 16 +-- fdbserver/KeyValueStoreSQLite.actor.cpp | 8 +- fdbserver/LeaderElection.actor.cpp | 8 +- fdbserver/LogRouter.actor.cpp | 16 +-- fdbserver/LogSystem.h | 12 +- fdbserver/LogSystemPeekCursor.actor.cpp | 22 ++-- fdbserver/MasterInterface.h | 2 +- fdbserver/MasterProxyServer.actor.cpp | 22 ++-- fdbserver/MoveKeys.actor.cpp | 24 ++-- fdbserver/OldTLogServer_4_6.actor.cpp | 44 ++++---- fdbserver/OldTLogServer_6_0.actor.cpp | 58 +++++----- fdbserver/Orderer.actor.h | 4 +- fdbserver/Ratekeeper.actor.cpp | 4 +- fdbserver/Resolver.actor.cpp | 6 +- fdbserver/ResolverInterface.h | 4 +- fdbserver/RestoreInterface.h | 2 +- 
fdbserver/SimulatedCluster.actor.cpp | 4 +- fdbserver/Status.actor.cpp | 2 +- fdbserver/TLogInterface.h | 10 +- fdbserver/TLogServer.actor.cpp | 62 +++++------ fdbserver/TagPartitionedLogSystem.actor.cpp | 4 +- fdbserver/VFSAsync.cpp | 2 +- fdbserver/WaitFailure.actor.cpp | 6 +- fdbserver/WaitFailure.h | 8 +- fdbserver/WorkerInterface.actor.h | 2 +- fdbserver/fdbserver.actor.cpp | 2 +- fdbserver/masterserver.actor.cpp | 10 +- fdbserver/networktest.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 34 +++--- fdbserver/worker.actor.cpp | 6 +- flow/IThreadPool.h | 6 +- flow/Net2.actor.cpp | 68 ++++++------ flow/Profiler.actor.cpp | 2 +- flow/ThreadHelper.actor.h | 6 +- flow/Trace.cpp | 2 +- flow/flow.h | 14 +-- flow/genericactors.actor.h | 18 +-- flow/network.h | 115 +++++++++++--------- 72 files changed, 531 insertions(+), 522 deletions(-) diff --git a/bindings/flow/fdb_flow.actor.cpp b/bindings/flow/fdb_flow.actor.cpp index 96512a0ce4..99af1a665e 100644 --- a/bindings/flow/fdb_flow.actor.cpp +++ b/bindings/flow/fdb_flow.actor.cpp @@ -85,7 +85,7 @@ void fdb_flow_test() { openTraceFile(NetworkAddress(), 1000000, 1000000, "."); systemMonitor(); - uncancellable(recurring(&systemMonitor, 5.0, TaskFlushTrace)); + uncancellable(recurring(&systemMonitor, 5.0, TaskPriority::FlushTrace)); Future t = _test(); @@ -179,7 +179,7 @@ namespace FDB { } void backToFutureCallback( FDBFuture* f, void* data ) { - g_network->onMainThread( Promise((SAV*)data), TaskDefaultOnMainThread ); // SOMEDAY: think about this priority + g_network->onMainThread( Promise((SAV*)data), TaskPriority::DefaultOnMainThread ); // SOMEDAY: think about this priority } // backToFuture( FDBFuture*, (FDBFuture* -> Type) ) -> Future diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index 1de08c64f8..25bc58c71d 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -419,7 +419,7 @@ ACTOR Future readCommitted(Database cx, 
PromiseStreamtake(TaskDefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT)); + wait(lock->take(TaskPriority::DefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT)); releaser = FlowLock::Releaser(*lock, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT); state Standalone values = wait(tr.getRange(begin, end, limits)); @@ -495,7 +495,7 @@ ACTOR Future readCommitted(Database cx, PromiseStream results, Fu //add lock wait(active); releaser.release(); - wait(lock->take(TaskDefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize())); + wait(lock->take(TaskPriority::DefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize())); releaser = FlowLock::Releaser(*lock, rangevalue.expectedSize() + rcGroup.items.expectedSize()); for (auto & s : rangevalue){ @@ -613,7 +613,7 @@ ACTOR Future dumpData(Database cx, PromiseStream results, Referenc req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; totalBytes += mutationSize; - wait( commitLock->take(TaskDefaultYield, mutationSize) ); + wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) ); addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) ); if(endOfStream) { @@ -653,7 +653,7 @@ ACTOR Future coalesceKeyVersionCache(Key uid, Version endVersion, Referenc req.transaction.read_snapshot = committedVersion->get(); req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; - wait( commitLock->take(TaskDefaultYield, mutationSize) ); + wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) ); addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) ); } @@ -671,7 +671,7 @@ ACTOR Future applyMutations(Database cx, Key uid, Key addPrefix, Key remov try { loop { if(beginVersion >= *endVersion) { - wait( commitLock.take(TaskDefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) ); + wait( 
commitLock.take(TaskPriority::DefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) ); commitLock.release(CLIENT_KNOBS->BACKUP_LOCK_BYTES); if(beginVersion >= *endVersion) { return Void(); diff --git a/fdbclient/ClusterInterface.h b/fdbclient/ClusterInterface.h index bb51ce74f2..5e17807c4d 100644 --- a/fdbclient/ClusterInterface.h +++ b/fdbclient/ClusterInterface.h @@ -52,12 +52,12 @@ struct ClusterInterface { } void initEndpoints() { - openDatabase.getEndpoint( TaskClusterController ); - failureMonitoring.getEndpoint( TaskFailureMonitor ); - databaseStatus.getEndpoint( TaskClusterController ); - ping.getEndpoint( TaskClusterController ); - getClientWorkers.getEndpoint( TaskClusterController ); - forceRecovery.getEndpoint( TaskClusterController ); + openDatabase.getEndpoint( TaskPriority::ClusterController ); + failureMonitoring.getEndpoint( TaskPriority::FailureMonitor ); + databaseStatus.getEndpoint( TaskPriority::ClusterController ); + ping.getEndpoint( TaskPriority::ClusterController ); + getClientWorkers.getEndpoint( TaskPriority::ClusterController ); + forceRecovery.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 0245c2abdb..606952fb9c 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -54,7 +54,7 @@ public: // For internal (fdbserver) use only static Database create( Reference>> clusterInterface, Reference connFile, LocalityData const& clientLocality ); - static Database create( Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID=TaskDefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST ); + static Database create( Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID=TaskPriority::DefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST ); 
~DatabaseContext(); @@ -97,7 +97,7 @@ public: //private: explicit DatabaseContext( Reference cluster, Reference> clientDBInfo, - Future clientInfoMonitor, Standalone dbId, int taskID, LocalityData const& clientLocality, + Future clientInfoMonitor, Standalone dbId, TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion = Database::API_VERSION_LATEST ); explicit DatabaseContext( const Error &err ); @@ -161,7 +161,7 @@ public: Future logger; - int taskID; + TaskPriority taskID; Int64MetricHandle getValueSubmitted; EventMetricHandle getValueCompleted; diff --git a/fdbclient/FailureMonitorClient.actor.cpp b/fdbclient/FailureMonitorClient.actor.cpp index 3be7a4dccd..7cb1a3144e 100644 --- a/fdbclient/FailureMonitorClient.actor.cpp +++ b/fdbclient/FailureMonitorClient.actor.cpp @@ -41,7 +41,7 @@ ACTOR Future failureMonitorClientLoop( { state Version version = 0; state Future request = Never(); - state Future nextRequest = delay(0, TaskFailureMonitor); + state Future nextRequest = delay(0, TaskPriority::FailureMonitor); state Future requestTimeout = Never(); state double before = now(); state double waitfor = 0; @@ -61,7 +61,7 @@ ACTOR Future failureMonitorClientLoop( loop { choose { when( FailureMonitoringReply reply = wait( request ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); request = Never(); requestTimeout = Never(); if (reply.allOthersFailed) { @@ -122,10 +122,10 @@ ACTOR Future failureMonitorClientLoop( } before = now(); waitfor = reply.clientRequestIntervalMS * .001; - nextRequest = delayJittered( waitfor, TaskFailureMonitor ); + nextRequest = delayJittered( waitfor, TaskPriority::FailureMonitor ); } when( wait( requestTimeout ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); requestTimeout = Never(); TraceEvent(SevWarn, 
"FailureMonitoringServerDown").detail("OldServerID",controller.id()); monitor->setStatus(controlAddr.address, FailureStatus(true)); @@ -136,7 +136,7 @@ ACTOR Future failureMonitorClientLoop( } } when( wait( nextRequest ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); nextRequest = Never(); double elapsed = now() - before; @@ -152,9 +152,9 @@ ACTOR Future failureMonitorClientLoop( req.addresses = g_network->getLocalAddresses(); if (trackMyStatus) req.senderStatus = FailureStatus(false); - request = controller.failureMonitoring.getReply( req, TaskFailureMonitor ); + request = controller.failureMonitoring.getReply( req, TaskPriority::FailureMonitor ); if(!controller.failureMonitoring.getEndpoint().isLocal()) - requestTimeout = delay( fmState->serverFailedTimeout, TaskFailureMonitor ); + requestTimeout = delay( fmState->serverFailedTimeout, TaskPriority::FailureMonitor ); } } } diff --git a/fdbclient/HTTP.actor.cpp b/fdbclient/HTTP.actor.cpp index 00cece10a1..5893588406 100644 --- a/fdbclient/HTTP.actor.cpp +++ b/fdbclient/HTTP.actor.cpp @@ -93,7 +93,7 @@ namespace HTTP { loop { // Wait for connection to have something to read wait(conn->onReadable()); - wait( delay( 0, TaskReadSocket ) ); + wait( delay( 0, TaskPriority::ReadSocket ) ); // Read into buffer int originalSize = buf->size(); @@ -353,7 +353,7 @@ namespace HTTP { loop { wait(conn->onWritable()); - wait( delay( 0, TaskWriteSocket ) ); + wait( delay( 0, TaskPriority::WriteSocket ) ); // If we already got a response, before finishing sending the request, then close the connection, // set the Connection header to "close" as a hint to the caller that this connection can't be used diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index a371ac2624..afc64d62c2 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -967,7 +967,7 @@ ACTOR Future changeQuorum( Database cx, Reference>> 
leaderServers; ClientCoordinators coord( Reference( new ClusterConnectionFile( conn ) ) ); for( int i = 0; i < coord.clientLeaderServers.size(); i++ ) - leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) ); + leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) ); choose { when( wait( waitForAll( leaderServers ) ) ) {} @@ -1047,7 +1047,7 @@ struct AutoQuorumChange : IQuorumChange { ClientCoordinators coord(ccf); vector>> leaderServers; for( int i = 0; i < coord.clientLeaderServers.size(); i++ ) - leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) ); + leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) ); Optional>> results = wait( timeout( getAll(leaderServers), CLIENT_KNOBS->IS_ACCEPTABLE_DELAY ) ); if (!results.present()) return false; // Not all responded for(auto& r : results.get()) diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index 9b65ec572c..dea0d8b797 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -67,10 +67,10 @@ struct MasterProxyInterface { } void initEndpoints() { - getConsistentReadVersion.getEndpoint(TaskProxyGetConsistentReadVersion); - getRawCommittedVersion.getEndpoint(TaskProxyGetRawCommittedVersion); - commit.getEndpoint(TaskProxyCommitDispatcher); - getStorageServerRejoinInfo.getEndpoint(TaskProxyStorageRejoin); + getConsistentReadVersion.getEndpoint(TaskPriority::ProxyGetConsistentReadVersion); + getRawCommittedVersion.getEndpoint(TaskPriority::ProxyGetRawCommittedVersion); + commit.getEndpoint(TaskPriority::ProxyCommitDispatcher); + 
getStorageServerRejoinInfo.getEndpoint(TaskPriority::ProxyStorageRejoin); //getKeyServersLocations.getEndpoint(TaskProxyGetKeyServersLocations); //do not increase the priority of these requests, because clients cans bring down the cluster with too many of these messages. } }; diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 6210eb8810..b066b03b13 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -371,7 +371,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( NetworkAddress remote ) } ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) { - getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskCoordination ); + getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination ); } // Nominee is the worker among all workers that are considered as leader by a coordinator @@ -380,7 +380,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) { ACTOR Future monitorNominee( Key key, ClientLeaderRegInterface coord, AsyncTrigger* nomineeChange, Optional *info, int generation, Reference> connectedCoordinatorsNum ) { state bool hasCounted = false; loop { - state Optional li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? info->get().changeID : UID() ), TaskCoordinationReply ) ); + state Optional li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? 
info->get().changeID : UID() ), TaskPriority::CoordinationReply ) ); if (li.present() && !hasCounted && connectedCoordinatorsNum.isValid()) { connectedCoordinatorsNum->set(connectedCoordinatorsNum->get() + 1); hasCounted = true; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 6fbf778997..38b373c954 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -509,7 +509,7 @@ Future DatabaseContext::getHealthMetrics(bool detailed = false) { DatabaseContext::DatabaseContext( Reference cluster, Reference> clientInfo, Future clientInfoMonitor, Standalone dbId, - int taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) + TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) : cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), dbId(dbId), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), lockAware(lockAware), apiVersion(apiVersion), provisional(false), transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), @@ -629,10 +629,10 @@ Database DatabaseContext::create(Reference>> Reference> clientInfo(new AsyncVar()); Future clientInfoMonitor = delayedAsyncVar(connectedCoordinatorsNum, connectedCoordinatorsNumDelayed, CLIENT_KNOBS->CHECK_CONNECTED_COORDINATOR_NUM_DELAY) || monitorClientInfo(clusterInterface, connFile, clientInfo, connectedCoordinatorsNumDelayed); - return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false)); + return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false)); } -Database DatabaseContext::create(Reference> 
clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID, bool lockAware, int apiVersion) { +Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID, bool lockAware, int apiVersion) { return Database( new DatabaseContext( Reference(nullptr), clientInfo, clientInfoMonitor, LiteralStringRef(""), taskID, clientLocality, enableLocalityLoadBalance, lockAware, apiVersion ) ); } @@ -820,10 +820,10 @@ Database Database::createDatabase( Reference connFile, in DatabaseContext *db; if(preallocatedDb) { - db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion); + db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); } else { - db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion); + db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); } return Database(db); @@ -879,7 +879,7 @@ void Cluster::init( Reference connFile, bool startClientI initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(publicIP))); systemMonitor(); - uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskFlushTrace ) ); + uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace ) ); } failMon = failureMonitorClient( clusterInterface, false ); @@ -1235,7 +1235,7 @@ ACTOR Future< pair> > getKeyLocation_internal( loop { choose { when ( wait( cx->onMasterProxiesChanged() ) ) {} - when ( GetKeyServerLocationsReply rep = wait( 
loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional(), 100, isBackward, key.arena()), TaskDefaultPromiseEndpoint ) ) ) { + when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After"); ASSERT( rep.results.size() == 1 ); @@ -1272,7 +1272,7 @@ ACTOR Future< vector< pair> > > getKeyRangeLoca loop { choose { when ( wait( cx->onMasterProxiesChanged() ) ) {} - when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskDefaultPromiseEndpoint ) ) ) { + when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { state GetKeyServerLocationsReply rep = _rep; if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.After"); @@ -1393,7 +1393,7 @@ ACTOR Future> getValue( Future version, Key key, Databa } state GetValueReply reply = wait( loadBalance(ssi.second, &StorageServerInterface::getValue, GetValueRequest(key, ver, getValueID), - TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL)); + TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? 
&cx->queueModel : NULL)); double latency = now() - startTimeD; cx->readLatencies.addSample(latency); if (trLogInfo) { @@ -1456,7 +1456,7 @@ ACTOR Future getKey( Database cx, KeySelector k, Future version, T if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); ++cx->transactionPhysicalReads; - GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", reply.sel.offset).detail("OrEqual", k.orEqual); k = reply.sel; @@ -1519,7 +1519,7 @@ ACTOR Future< Void > watchValue( Future version, Key key, OptionalgetCurrentTask()); } - state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskDefaultPromiseEndpoint ) ); + state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskPriority::DefaultPromiseEndpoint ) ); if( info.debugID.present() ) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); //.detail("TaskID", g_network->getCurrentTask()); } @@ -1611,7 +1611,7 @@ ACTOR Future> getExactRange( Database cx, Version ver .detail("Servers", locations[shard].second->description());*/ } ++cx->transactionPhysicalReads; - GetKeyValuesReply rep = wait( loadBalance( 
locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyValuesReply rep = wait( loadBalance( locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.After"); output.arena().dependsOn( rep.arena ); @@ -1888,7 +1888,7 @@ ACTOR Future> getRange( Database cx, ReferenceenableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyValuesReply rep = wait( loadBalance(beginServer.second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getRange.After");//.detail("SizeOf", rep.data.size()); @@ -2698,7 +2698,7 @@ ACTOR static Future tryCommit( Database cx, Reference const std::vector& proxies = cx->clientInfo->get().proxies; reply = proxies.size() ? 
throwErrorOr ( brokenPromiseToMaybeDelivered ( proxies[0].commit.tryGetReply(req) ) ) : Never(); } else { - reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskDefaultPromiseEndpoint, true ); + reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, true ); } choose { @@ -3073,7 +3073,7 @@ ACTOR Future readVersionBatcher( DatabaseContext *cx, FutureStream< std::p if (requests.size() == CLIENT_KNOBS->MAX_BATCH_SIZE) send_batch = true; else if (!timeout.isValid()) - timeout = delay(batchTime, TaskProxyGetConsistentReadVersion); + timeout = delay(batchTime, TaskPriority::ProxyGetConsistentReadVersion); } when(wait(timeout.isValid() ? timeout : Never())) { send_batch = true; @@ -3240,7 +3240,7 @@ ACTOR Future< StorageMetrics > waitStorageMetricsMultipleLocations( WaitMetricsRequest req(locations[i].first, StorageMetrics(), StorageMetrics()); req.min.bytes = 0; req.max.bytes = -1; - fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution ); + fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution ); } wait( waitForAll(fx) ); @@ -3271,7 +3271,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( int shardLimit ) { loop { - vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskDataDistribution) ) ); + vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskPriority::DataDistribution) ) ); //SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this. 
if(locations.size() < shardLimit) { @@ -3281,7 +3281,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( fx = waitStorageMetricsMultipleLocations( locations, min, max, permittedError ); } else { WaitMetricsRequest req( keys, min, max ); - fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution ); + fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution ); } StorageMetrics x = wait(fx); return x; @@ -3291,14 +3291,14 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( throw; } cx->invalidateCache(keys); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } else { TraceEvent(SevWarn, "WaitStorageMetricsPenalty") .detail("Keys", keys) .detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY); - wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution)); + wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range cx->invalidateCache( keys ); } @@ -3324,13 +3324,13 @@ Future< StorageMetrics > Transaction::getStorageMetrics( KeyRange const& keys, i ACTOR Future< Standalone> > splitStorageMetrics( Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated ) { loop { - state vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskDataDistribution) ) ); + state vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskPriority::DataDistribution) 
) ); state StorageMetrics used; state Standalone> results; //SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this. if(locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) { - wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); cx->invalidateCache(keys); } else { @@ -3341,7 +3341,7 @@ ACTOR Future< Standalone> > splitStorageMetrics( Database cx, state int i = 0; for(; i> > splitStorageMetrics( Database cx, throw; } cx->invalidateCache( keys ); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 0f59d368c5..e4310e9721 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -164,10 +164,10 @@ struct TransactionOptions { struct TransactionInfo { Optional debugID; - int taskID; + TaskPriority taskID; bool useProvisionalProxies; - explicit TransactionInfo( int taskID ) : taskID(taskID), useProvisionalProxies(false) {} + explicit TransactionInfo( TaskPriority taskID ) : taskID(taskID), useProvisionalProxies(false) {} }; struct TransactionLogInfo : public ReferenceCounted, NonCopyable { @@ -287,7 +287,7 @@ public: void flushTrLogsIfEnabled(); // These are to permit use as state variables in actors: - Transaction() : info( TaskDefaultEndpoint ) {} + Transaction() : info( TaskPriority::DefaultEndpoint ) {} void operator=(Transaction&& r) BOOST_NOEXCEPT; void reset(); diff --git a/fdbclient/StatusClient.actor.cpp b/fdbclient/StatusClient.actor.cpp index d4b06a5182..8e706987a9 100644 --- a/fdbclient/StatusClient.actor.cpp +++ b/fdbclient/StatusClient.actor.cpp @@ -291,7 +291,7 @@ ACTOR Future> clientCoordinatorsStatusFetcher(Reference>> leaderServers; for 
(int i = 0; i < coord.clientLeaderServers.size(); i++) - leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskCoordinationReply)); + leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskPriority::CoordinationReply)); wait( smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.5) || delay(2.0) ); diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 6225fd50f7..ebc880f8ce 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -80,9 +80,9 @@ struct StorageServerInterface { bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; } bool operator < (StorageServerInterface const& s) const { return uniqueID < s.uniqueID; } void initEndpoints() { - getValue.getEndpoint( TaskLoadBalancedEndpoint ); - getKey.getEndpoint( TaskLoadBalancedEndpoint ); - getKeyValues.getEndpoint( TaskLoadBalancedEndpoint ); + getValue.getEndpoint( TaskPriority::LoadBalancedEndpoint ); + getKey.getEndpoint( TaskPriority::LoadBalancedEndpoint ); + getKeyValues.getEndpoint( TaskPriority::LoadBalancedEndpoint ); } }; diff --git a/fdbclient/VersionedMap.actor.h b/fdbclient/VersionedMap.actor.h index cfb9e650f6..953c2f4c1f 100644 --- a/fdbclient/VersionedMap.actor.h +++ b/fdbclient/VersionedMap.actor.h @@ -31,7 +31,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
ACTOR template -Future deferredCleanupActor( std::vector toFree, int taskID = 7000 ) { +Future deferredCleanupActor( std::vector toFree, TaskPriority taskID = 7000 ) { state int freeCount = 0; while (!toFree.empty()) { Tree a = std::move( toFree.back() ); diff --git a/fdbclient/VersionedMap.h b/fdbclient/VersionedMap.h index 705108ce72..58c440c679 100644 --- a/fdbclient/VersionedMap.h +++ b/fdbclient/VersionedMap.h @@ -511,7 +511,7 @@ public: oldestVersion = newOldestVersion; } - Future forgetVersionsBeforeAsync( Version newOldestVersion, int taskID = 7000 ) { + Future forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = 7000 ) { ASSERT( newOldestVersion <= latestVersion ); roots[newOldestVersion] = getRoot(newOldestVersion); diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index 12ca1866ad..f786266888 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -266,7 +266,7 @@ private: } ACTOR static Future read_impl( int fd, void* data, int length, int64_t offset ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; //fprintf(stderr, "eio_read (fd=%d length=%d offset=%lld)\n", fd, length, offset); state eio_req* r = eio_read(fd, data, length, offset, 0, eio_callback, &p); @@ -289,7 +289,7 @@ private: } ACTOR static Future write_impl( int fd, Reference err, StringRef data, int64_t offset ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_write(fd, (void*)data.begin(), data.size(), offset, 0, eio_callback, &p); try { wait( p.getFuture() ); } catch (...) 
{ g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -299,7 +299,7 @@ private: } ACTOR static Future truncate_impl( int fd, Reference err, int64_t size ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_ftruncate(fd, size, 0, eio_callback, &p); try { wait( p.getFuture() ); } catch (...) { g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -330,7 +330,7 @@ private: } ACTOR static Future sync_impl( int fd, Reference err, bool sync_metadata=false ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = start_fsync( fd, p, sync_metadata ); @@ -350,7 +350,7 @@ private: } ACTOR static Future size_impl( int fd ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_fstat( fd, 0, eio_callback, &p ); try { wait( p.getFuture() ); } catch (...) 
{ g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -363,7 +363,7 @@ private: } ACTOR static Future stat_impl( std::string filename ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state EIO_STRUCT_STAT statdata; state eio_req* r = eio_stat( filename.c_str(), 0, eio_callback, &p ); @@ -377,7 +377,7 @@ private: ACTOR template static Future dispatch_impl( std::function func) { state Dispatch data( func ); - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state eio_req* r = eio_custom( [](eio_req* req) { // Runs on the eio thread pool @@ -418,7 +418,7 @@ private: static void eio_want_poll() { want_poll = 1; // SOMEDAY: NULL for deferred error, no analysis of correctness (itp) - onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPollEIO); + onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPriority::PollEIO); } static int eio_callback( eio_req* req ) { diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index ac66605be3..14495a6cdf 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -472,9 +472,9 @@ private: #endif } - int getTask() const { return (prio>>32)+1; } + TaskPriority getTask() const { return static_cast((prio>>32)+1); } - ACTOR static void deliver( Promise result, bool failed, int r, int task ) { + ACTOR static void deliver( Promise result, bool failed, int r, TaskPriority task ) { wait( delay(0, task) ); if (failed) result.sendError(io_timeout()); else if (r < 0) result.sendError(io_error()); @@ -649,7 +649,7 @@ private: loop { wait(success(ev->read())); - wait(delay(0, TaskDiskIOComplete)); + wait(delay(0, TaskPriority::DiskIOComplete)); linux_ioresult ev[FLOW_KNOBS->MAX_OUTSTANDING]; timespec tm; tm.tv_sec = 0; tm.tv_nsec = 0; diff --git a/fdbrpc/AsyncFileNonDurable.actor.cpp b/fdbrpc/AsyncFileNonDurable.actor.cpp index a3257f1fa8..6ea0129a27 100644 --- 
a/fdbrpc/AsyncFileNonDurable.actor.cpp +++ b/fdbrpc/AsyncFileNonDurable.actor.cpp @@ -23,13 +23,13 @@ std::map> AsyncFileNonDurable::filesBeingDeleted; -ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, int taskID ) { +ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID ) { wait( g_simulator.onProcess( process, taskID ) ); promise.send(Void()); return Void(); } -ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, int taskID ) { +ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, TaskPriority taskID ) { wait( g_simulator.onProcess( process, taskID ) ); promise.sendError(e); return Void(); diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 03fe8e852c..7e8e551b3e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -38,8 +38,8 @@ #undef max #undef min -Future sendOnProcess( ISimulator::ProcessInfo* const& process, Promise const& promise, int const& taskID ); -Future sendErrorOnProcess( ISimulator::ProcessInfo* const& process, Promise const& promise, Error const& e, int const& taskID ); +ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID ); +ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, TaskPriority taskID ); ACTOR template Future sendErrorOnShutdown( Future in ) { @@ -198,7 +198,7 @@ public: //Creates a new AsyncFileNonDurable which wraps the provided IAsyncFile ACTOR static Future> open(std::string filename, std::string actualFilename, Future> wrappedFile, Reference diskParameters) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); state Future shutdown = 
success(currentProcess->shutdownSignal.getFuture()); //TraceEvent("AsyncFileNonDurableOpenBegin").detail("Filename", filename).detail("Addr", g_simulator.getCurrentProcess()->address); @@ -391,7 +391,7 @@ private: ACTOR Future read(AsyncFileNonDurable *self, void *data, int length, int64_t offset) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { @@ -411,7 +411,7 @@ private: //or none of the write. It may also corrupt parts of sectors which have not been written correctly ACTOR Future write(AsyncFileNonDurable *self, Promise writeStarted, Future> ownFuture, void const* data, int length, int64_t offset) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; @@ -535,7 +535,7 @@ private: //If a kill interrupts the delay, then the truncate may or may not be performed ACTOR Future truncate(AsyncFileNonDurable *self, Promise truncateStarted, Future> ownFuture, int64_t size) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; @@ -573,8 +573,8 @@ private: } } - if(g_network->check_yield(TaskDefaultYield)) { - wait(delay(0, TaskDefaultYield)); + if(g_network->check_yield(TaskPriority::DefaultYield)) { + wait(delay(0, TaskPriority::DefaultYield)); } //If performing a durable truncate, then pass it 
through to the file. Otherwise, pass it through with a 1/2 chance @@ -663,7 +663,7 @@ private: ACTOR Future sync(AsyncFileNonDurable *self, bool durable) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { @@ -695,7 +695,7 @@ private: ACTOR Future size(AsyncFileNonDurable *self) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); @@ -714,7 +714,7 @@ private: //Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it ACTOR Future deleteFile(AsyncFileNonDurable *self) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); state std::string filename = self->filename; wait( g_simulator.onMachine( currentProcess ) ); diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index dabc9800f7..46ca17f8e7 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -172,28 +172,28 @@ struct YieldMockNetwork : INetwork, ReferenceCounted { t.send(Void()); } - virtual Future delay(double seconds, int taskID) { + virtual Future delay(double seconds, TaskPriority taskID) { return nextTick.getFuture(); } - virtual Future yield(int taskID) { + virtual Future yield(TaskPriority taskID) { if (check_yield(taskID)) return delay(0,taskID); return Void(); } - virtual bool check_yield(int taskID) { + virtual bool check_yield(TaskPriority taskID) { if (nextYield > 0) --nextYield; return nextYield == 0; } // Delegate everything else. 
TODO: Make a base class NetworkWrapper for delegating everything in INetwork - virtual int getCurrentTask() { return baseNetwork->getCurrentTask(); } - virtual void setCurrentTask(int taskID) { baseNetwork->setCurrentTask(taskID); } + virtual TaskPriority getCurrentTask() { return baseNetwork->getCurrentTask(); } + virtual void setCurrentTask(TaskPriority taskID) { baseNetwork->setCurrentTask(taskID); } virtual double now() { return baseNetwork->now(); } virtual void stop() { return baseNetwork->stop(); } virtual bool isSimulated() const { return baseNetwork->isSimulated(); } - virtual void onMainThread(Promise&& signal, int taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); } + virtual void onMainThread(Promise&& signal, TaskPriority taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); } virtual THREAD_HANDLE startThread(THREAD_FUNC_RETURN(*func) (void *), void *arg) { return baseNetwork->startThread(func,arg); } virtual Future< Reference > open(std::string filename, int64_t flags, int64_t mode) { return IAsyncFileSystem::filesystem()->open(filename,flags,mode); } virtual Future< Void > deleteFile(std::string filename, bool mustBeDurable) { return IAsyncFileSystem::filesystem()->deleteFile(filename,mustBeDurable); } diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index b08ef9756a..ae709cd675 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -49,7 +49,7 @@ public: EndpointMap(); void insert( NetworkMessageReceiver* r, Endpoint::Token& token, uint32_t priority ); NetworkMessageReceiver* get( Endpoint::Token const& token ); - uint32_t getPriority( Endpoint::Token const& token ); + TaskPriority getPriority( Endpoint::Token const& token ); void remove( Endpoint::Token const& token, NetworkMessageReceiver* r ); private: @@ -99,11 +99,11 @@ NetworkMessageReceiver* EndpointMap::get( Endpoint::Token const& token ) { return 0; } -uint32_t EndpointMap::getPriority( 
Endpoint::Token const& token ) { +TaskPriority EndpointMap::getPriority( Endpoint::Token const& token ) { uint32_t index = token.second(); if ( index < data.size() && data[index].token().first() == token.first() && ((data[index].token().second()&0xffffffff00000000LL)|index)==token.second() ) - return data[index].token().second(); - return TaskUnknownEndpoint; + return static_cast(data[index].token().second()); + return TaskPriority::UnknownEndpoint; } void EndpointMap::remove( Endpoint::Token const& token, NetworkMessageReceiver* r ) { @@ -119,7 +119,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver { EndpointNotFoundReceiver(EndpointMap& endpoints) { //endpoints[WLTOKEN_ENDPOINT_NOT_FOUND] = this; Endpoint::Token e = WLTOKEN_ENDPOINT_NOT_FOUND; - endpoints.insert(this, e, TaskDefaultEndpoint); + endpoints.insert(this, e, static_cast(TaskPriority::DefaultEndpoint)); ASSERT( e == WLTOKEN_ENDPOINT_NOT_FOUND ); } virtual void receive( ArenaReader& reader ) { @@ -138,7 +138,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver { struct PingReceiver : NetworkMessageReceiver { PingReceiver(EndpointMap& endpoints) { Endpoint::Token e = WLTOKEN_PING_PACKET; - endpoints.insert(this, e, TaskReadSocket); + endpoints.insert(this, e, static_cast(TaskPriority::ReadSocket)); ASSERT( e == WLTOKEN_PING_PACKET ); } virtual void receive( ArenaReader& reader ) { @@ -435,10 +435,10 @@ struct Peer : NonCopyable { ACTOR static Future connectionWriter( Peer* self, Reference conn ) { state double lastWriteTime = now(); loop { - //wait( delay(0, TaskWriteSocket) ); - wait( delayJittered(std::max(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), TaskWriteSocket) ); - //wait( delay(500e-6, TaskWriteSocket) ); - //wait( yield(TaskWriteSocket) ); + //wait( delay(0, TaskPriority::WriteSocket) ); + wait( delayJittered(std::max(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), 
TaskPriority::WriteSocket) ); + //wait( delay(500e-6, TaskPriority::WriteSocket) ); + //wait( yield(TaskPriority::WriteSocket) ); // Send until there is nothing left to send loop { @@ -453,7 +453,7 @@ struct Peer : NonCopyable { TEST(true); // We didn't write everything, so apparently the write buffer is full. Wait for it to be nonfull. wait( conn->onWritable() ); - wait( yield(TaskWriteSocket) ); + wait( yield(TaskPriority::WriteSocket) ); } // Wait until there is something to send @@ -599,8 +599,8 @@ TransportData::~TransportData() { } ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader reader, bool inReadSocket) { - int priority = self->endpoints.getPriority(destination.token); - if (priority < TaskReadSocket || !inReadSocket) { + TaskPriority priority = self->endpoints.getPriority(destination.token); + if (priority < TaskPriority::ReadSocket || !inReadSocket) { wait( delay(0, priority) ); } else { g_network->setCurrentTask( priority ); @@ -634,7 +634,7 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader } if( inReadSocket ) - g_network->setCurrentTask( TaskReadSocket ); + g_network->setCurrentTask( TaskPriority::ReadSocket ); } static void scanPackets(TransportData* transport, uint8_t*& unprocessed_begin, uint8_t* e, Arena& arena, @@ -884,11 +884,11 @@ ACTOR static Future connectionReader( if (readWillBlock) break; - wait(yield(TaskReadSocket)); + wait(yield(TaskPriority::ReadSocket)); } wait( conn->onReadable() ); - wait(delay(0, TaskReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time + wait(delay(0, TaskPriority::ReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time } } catch (Error& e) { @@ -932,7 +932,7 @@ ACTOR static Future listen( TransportData* self, NetworkAddress listenAddr .detail("FromAddress", conn->getPeerAddress()) 
.detail("ListenAddress", listenAddr.toString()); incoming.add( connectionIncoming(self, conn) ); - wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskWriteSocket)); + wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskPriority::WriteSocket)); } } catch (Error& e) { TraceEvent(SevError, "ListenError").error(e); @@ -1054,7 +1054,7 @@ void FlowTransport::removePeerReference( const Endpoint& endpoint, NetworkMessag } } -void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) { +void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) { endpoint.token = deterministicRandom()->randomUniqueID(); if (receiver->isStream()) { endpoint.addresses = self->localAddresses; @@ -1063,18 +1063,18 @@ void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* rec endpoint.addresses = NetworkAddressList(); endpoint.token = UID( endpoint.token.first() & ~TOKEN_STREAM_FLAG, endpoint.token.second() ); } - self->endpoints.insert( receiver, endpoint.token, taskID ); + self->endpoints.insert( receiver, endpoint.token, static_cast(taskID) ); } void FlowTransport::removeEndpoint( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) { self->endpoints.remove(endpoint.token, receiver); } -void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) { +void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) { endpoint.addresses = self->localAddresses; ASSERT( ((endpoint.token.first() & TOKEN_STREAM_FLAG)!=0) == receiver->isStream() ); Endpoint::Token otoken = endpoint.token; - self->endpoints.insert( receiver, endpoint.token, taskID ); + self->endpoints.insert( receiver, endpoint.token, static_cast(taskID) ); ASSERT( endpoint.token == otoken ); } diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index 827d2727e6..d1be8c3411 
100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -137,13 +137,13 @@ public: void removePeerReference( const Endpoint&, NetworkMessageReceiver* ); // Signal that a peer connection is no longer being used - void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID ); + void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); // Sets endpoint to be a new local endpoint which delivers messages to the given receiver void removeEndpoint( const Endpoint&, NetworkMessageReceiver* ); // The given local endpoint no longer delivers messages to the given receiver or uses resources - void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID ); + void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); // Sets endpoint to a new local endpoint (without changing its token) which delivers messages to the given receiver // Implementations may have limitations on when this function is called and what endpoint.token may be! 
diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 557759d9a5..903a197f58 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -178,7 +178,7 @@ Future< REPLY_TYPE(Request) > loadBalance( Reference> alternatives, RequestStream Interface::* channel, Request request = Request(), - int taskID = TaskDefaultPromiseEndpoint, + TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = NULL) { diff --git a/fdbrpc/batcher.actor.h b/fdbrpc/batcher.actor.h index 7e276ad574..72a9bc9094 100644 --- a/fdbrpc/batcher.actor.h +++ b/fdbrpc/batcher.actor.h @@ -47,7 +47,7 @@ bool firstInBatch(CommitTransactionRequest x) { } ACTOR template -Future batcher(PromiseStream, int> > out, FutureStream in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, int taskID = TaskDefaultDelay, Counter* counter = 0) +Future batcher(PromiseStream, int> > out, FutureStream in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, TaskPriority taskID = TaskPriority::DefaultDelay, Counter* counter = 0) { wait( delayJittered(*avgMaxDelay, taskID) ); // smooth out // This is set up to deliver even zero-size batches if emptyBatchTimeout elapses, because that's what master proxy wants. The source control history diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 9853cbe968..470cec10d9 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -48,7 +48,7 @@ struct FlowReceiver : private NetworkMessageReceiver { // If already a remote endpoint, returns that. Otherwise makes this // a local endpoint and returns that. 
- const Endpoint& getEndpoint(int taskID) { + const Endpoint& getEndpoint(TaskPriority taskID) { if (!endpoint.isValid()) { m_isLocalEndpoint = true; FlowTransport::transport().addEndpoint(endpoint, this, taskID); @@ -56,7 +56,7 @@ struct FlowReceiver : private NetworkMessageReceiver { return endpoint; } - void makeWellKnownEndpoint(Endpoint::Token token, int taskID) { + void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) { ASSERT(!endpoint.isValid()); m_isLocalEndpoint = true; endpoint.token = token; @@ -128,7 +128,7 @@ public: ~ReplyPromise() { if (sav) sav->delPromiseRef(); } ReplyPromise(const Endpoint& endpoint) : sav(new NetSAV(0, 1, endpoint)) {} - const Endpoint& getEndpoint(int taskID = TaskDefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); } + const Endpoint& getEndpoint(TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); } void operator=(const ReplyPromise& rhs) { if (rhs.sav) rhs.sav->addPromiseRef(); @@ -204,19 +204,19 @@ template void resetReply(ReplyPromise & p) { p.reset(); } template -void resetReply(Request& r, int taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); } +void resetReply(Request& r, TaskPriority taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); } template -void resetReply(ReplyPromise & p, int taskID) { p.reset(); p.getEndpoint(taskID); } +void resetReply(ReplyPromise & p, TaskPriority taskID) { p.reset(); p.getEndpoint(taskID); } template -void setReplyPriority(Request& r, int taskID) { r.reply.getEndpoint(taskID); } +void setReplyPriority(Request& r, TaskPriority taskID) { r.reply.getEndpoint(taskID); } template -void setReplyPriority(ReplyPromise & p, int taskID) { p.getEndpoint(taskID); } +void setReplyPriority(ReplyPromise & p, TaskPriority taskID) { p.getEndpoint(taskID); } template -void setReplyPriority(const ReplyPromise & p, int taskID) { p.getEndpoint(taskID); } +void setReplyPriority(const ReplyPromise & p, TaskPriority taskID) { 
p.getEndpoint(taskID); } @@ -281,7 +281,7 @@ public: return reportEndpointFailure(getReplyPromise(value).getFuture(), getEndpoint()); } template - Future getReply(const X& value, int taskID) const { + Future getReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); return getReply(value); } @@ -290,7 +290,7 @@ public: return getReply(ReplyPromise()); } template - Future getReplyWithTaskID(int taskID) const { + Future getReplyWithTaskID(TaskPriority taskID) const { ReplyPromise reply; reply.getEndpoint(taskID); return getReply(reply); @@ -302,7 +302,7 @@ public: // If cancelled or returns failure, request was or will be delivered zero or one times. // The caller must be capable of retrying if this request returns failure template - Future> tryGetReply(const X& value, int taskID) const { + Future> tryGetReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); if (queue->isRemoteEndpoint()) { Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID)); @@ -344,7 +344,7 @@ public: // If it returns failure, the failure detector considers the endpoint failed permanently or for the given amount of time // See IFailureMonitor::onFailedFor() for an explanation of the duration and slope parameters. 
template - Future> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, int taskID) const { + Future> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, TaskPriority taskID) const { // If it is local endpoint, no need for failure monitoring return waitValueOrSignal(getReply(value, taskID), makeDependent(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope), @@ -388,8 +388,8 @@ public: //queue = (NetNotifiedQueue*)0xdeadbeef; } - Endpoint getEndpoint(int taskID = TaskDefaultEndpoint) const { return queue->getEndpoint(taskID); } - void makeWellKnownEndpoint(Endpoint::Token token, int taskID) { + Endpoint getEndpoint(TaskPriority taskID = TaskPriority::DefaultEndpoint) const { return queue->getEndpoint(taskID); } + void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) { queue->makeWellKnownEndpoint(token, taskID); } diff --git a/fdbrpc/genericactors.actor.h b/fdbrpc/genericactors.actor.h index 810ccdb731..744abaeebe 100644 --- a/fdbrpc/genericactors.actor.h +++ b/fdbrpc/genericactors.actor.h @@ -50,7 +50,7 @@ Future retryBrokenPromise( RequestStream to, Req request ) } ACTOR template -Future retryBrokenPromise( RequestStream to, Req request, int taskID ) { +Future retryBrokenPromise( RequestStream to, Req request, TaskPriority taskID ) { // Like to.getReply(request), except that a broken_promise exception results in retrying request immediately. // Suitable for use with well known endpoints, which are likely to return to existence after the other process restarts. // Not normally useful for ordinary endpoints, which conventionally are permanently destroyed after replying with broken_promise. 
diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 04aa0684ba..7b0547e922 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -422,7 +422,7 @@ public: ACTOR static Future> open( std::string filename, int flags, int mode, Reference diskParameters = Reference(new DiskParameters(25000, 150000000)), bool delayOnWrite = true ) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); if(++openCount >= 3000) { TraceEvent(SevError, "TooManyFiles"); @@ -741,11 +741,11 @@ public: // Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating machines and time virtual double now() { return time; } - virtual Future delay( double seconds, int taskID ) { - ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority); + virtual Future delay( double seconds, TaskPriority taskID ) { + ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max); return delay( seconds, taskID, currentProcess ); } - Future delay( double seconds, int taskID, ProcessInfo* machine ) { + Future delay( double seconds, TaskPriority taskID, ProcessInfo* machine ) { ASSERT( seconds >= -0.0001 ); seconds = std::max(0.0, seconds); Future f; @@ -760,13 +760,13 @@ public: return f; } - ACTOR static Future checkShutdown(Sim2 *self, int taskID) { + ACTOR static Future checkShutdown(Sim2 *self, TaskPriority taskID) { wait(success(self->getCurrentProcess()->shutdownSignal.getFuture())); self->setCurrentTask(taskID); return Void(); } - virtual Future yield( int taskID ) { - if (taskID == TaskDefaultYield) taskID = currentTaskID; + virtual Future yield( TaskPriority taskID ) { + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (check_yield(taskID)) { // We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation), but // don't 
want to prevent instantaneous shutdown of "rebooted" machines. @@ -775,7 +775,7 @@ public: setCurrentTask(taskID); return Void(); } - virtual bool check_yield( int taskID ) { + virtual bool check_yield( TaskPriority taskID ) { if (yielded) return true; if (--yield_limit <= 0) { yield_limit = deterministicRandom()->randomInt(1, 150); // If yield returns false *too* many times in a row, there could be a stack overflow, since we can't deterministically check stack size as the real network does @@ -783,10 +783,10 @@ public: } return yielded = BUGGIFY_WITH_PROB(0.01); } - virtual int getCurrentTask() { + virtual TaskPriority getCurrentTask() { return currentTaskID; } - virtual void setCurrentTask(int taskID ) { + virtual void setCurrentTask(TaskPriority taskID ) { currentTaskID = taskID; } // Sets the taskID/priority of the current task, without yielding @@ -923,7 +923,7 @@ public: } if ( mustBeDurable || deterministicRandom()->random01() < 0.5 ) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { wait( ::delay(0.05 * deterministicRandom()->random01()) ); @@ -949,7 +949,7 @@ public: ACTOR static Future runLoop(Sim2 *self) { state ISimulator::ProcessInfo *callingMachine = self->currentProcess; while ( !self->isStopped ) { - wait( self->net2->yield(TaskDefaultYield) ); + wait( self->net2->yield(TaskPriority::DefaultYield) ); self->mutex.enter(); if( self->tasks.size() == 0 ) { @@ -1579,23 +1579,23 @@ public: machines.erase(machineId); } - Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(-1) { + Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(TaskPriority::Zero) { // Not letting currentProcess be NULL eliminates some annoying special cases currentProcess = new 
ProcessInfo("NoMachine", LocalityData(Optional>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", ""); g_network = net2 = newNet2(false, true, objSerializer); Net2FileSystem::newFileSystem(); - check_yield(0); + check_yield(TaskPriority::Zero); } // Implementation struct Task { - int taskID; + TaskPriority taskID; double time; uint64_t stable; ProcessInfo* machine; Promise action; - Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Promise&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {} - Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Future& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); } + Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {} + Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); } Task(Task&& rhs) BOOST_NOEXCEPT : time(rhs.time), taskID(rhs.taskID), stable(rhs.stable), machine(rhs.machine), action(std::move(rhs.action)) {} void operator= ( Task const& rhs ) { taskID = rhs.taskID; time = rhs.time; stable = rhs.stable; machine = rhs.machine; action = rhs.action; } Task( Task const& rhs ) : taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {} @@ -1642,20 +1642,20 @@ public: } } - virtual void onMainThread( Promise&& signal, int taskID ) { + virtual void onMainThread( Promise&& signal, TaskPriority taskID ) { // This is presumably coming from either a "fake" thread pool thread, i.e. 
it is actually on this thread // or a thread created with g_network->startThread ASSERT(getCurrentProcess()); mutex.enter(); - ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority); + ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max); tasks.push( Task( time, taskID, taskCount++, getCurrentProcess(), std::move(signal) ) ); mutex.leave(); } - virtual Future onProcess( ISimulator::ProcessInfo *process, int taskID ) { + virtual Future onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID ) { return delay( 0, taskID, process ); } - virtual Future onMachine( ISimulator::ProcessInfo *process, int taskID ) { + virtual Future onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID ) { if( process->machine == 0 ) return Void(); return delay( 0, taskID, process->machine->machineProcess ); @@ -1664,7 +1664,7 @@ public: //time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because //time should only be modified from the main thread. 
double time; - int currentTaskID; + TaskPriority currentTaskID; //taskCount is guarded by ISimulator::mutex uint64_t taskCount; @@ -1694,9 +1694,9 @@ void startNewSimulator(bool objSerializer) { } ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) { - TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskDefaultDelay", TaskDefaultDelay); + TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriority::DefaultDelay", TaskPriority::DefaultDelay); - wait( g_sim2.delay( 0, TaskDefaultDelay, p ) ); // Switch to the machine in question + wait( g_sim2.delay( 0, TaskPriority::DefaultDelay, p ) ); // Switch to the machine in question try { ASSERT( kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || kt == ISimulator::RebootProcessAndDelete ); diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 81e3ecc4f6..403db9ce57 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -137,8 +137,8 @@ public: ProcessInfo* getProcess( Endpoint const& endpoint ) { return getProcessByAddress(endpoint.getPrimaryAddress()); } ProcessInfo* getCurrentProcess() { return currentProcess; } - virtual Future onProcess( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0; - virtual Future onMachine( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0; + virtual Future onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0; + virtual Future 
onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0; virtual ProcessInfo* newProcess(const char* name, IPAddress ip, uint16_t port, uint16_t listenPerProcess, LocalityData locality, ProcessClass startingClass, const char* dataFolder, diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 9fc12d502e..f4b07cdbe5 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -107,7 +107,7 @@ public: DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), clientInfo( new AsyncVar( ClientDBInfo() ) ), serverInfo( new AsyncVar( ServerDBInfo() ) ), - db( DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskDefaultEndpoint, true ) ) // SOMEDAY: Locality! + db( DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality! 
{ } @@ -1171,7 +1171,7 @@ public: serverInfo.clusterInterface = ccInterface; serverInfo.myLocality = locality; db.serverInfo->set( serverInfo ); - cx = openDBOnServer(db.serverInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true); } ~ClusterControllerData() { diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index dc9b41e5a6..d8432c7d1e 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -63,13 +63,13 @@ struct ClusterControllerFullInterface { void initEndpoints() { clientInterface.initEndpoints(); - recruitFromConfiguration.getEndpoint( TaskClusterController ); - recruitRemoteFromConfiguration.getEndpoint( TaskClusterController ); - recruitStorage.getEndpoint( TaskClusterController ); - registerWorker.getEndpoint( TaskClusterController ); - getWorkers.getEndpoint( TaskClusterController ); - registerMaster.getEndpoint( TaskClusterController ); - getServerDBInfo.getEndpoint( TaskClusterController ); + recruitFromConfiguration.getEndpoint( TaskPriority::ClusterController ); + recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterController ); + recruitStorage.getEndpoint( TaskPriority::ClusterController ); + registerWorker.getEndpoint( TaskPriority::ClusterController ); + getWorkers.getEndpoint( TaskPriority::ClusterController ); + registerMaster.getEndpoint( TaskPriority::ClusterController ); + getServerDBInfo.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index b4e7283592..641ded30a0 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -52,8 +52,8 @@ GenerationRegInterface::GenerationRegInterface( NetworkAddress remote ) GenerationRegInterface::GenerationRegInterface( INetwork* local ) { - read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskCoordination ); - 
write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskCoordination ); + read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskPriority::Coordination ); + write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskPriority::Coordination ); } LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote) @@ -67,9 +67,9 @@ LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote) LeaderElectionRegInterface::LeaderElectionRegInterface(INetwork* local) : ClientLeaderRegInterface(local) { - candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskCoordination ); - leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskCoordination ); - forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskCoordination ); + candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskPriority::Coordination ); + leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskPriority::Coordination ); + forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskPriority::Coordination ); } ServerCoordinators::ServerCoordinators( Reference cf ) diff --git a/fdbserver/CoroFlow.actor.cpp b/fdbserver/CoroFlow.actor.cpp index af9b5ac565..22eaab2b0f 100644 --- a/fdbserver/CoroFlow.actor.cpp +++ b/fdbserver/CoroFlow.actor.cpp @@ -263,7 +263,7 @@ typedef WorkPool CoroPool; -ACTOR void coroSwitcher( Future what, int taskID, Coro* coro ) { +ACTOR void coroSwitcher( Future what, TaskPriority taskID, Coro* coro ) { try { // state double t = now(); wait(what); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index ef8c25b2b6..9fc6c04ccd 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -88,7 +88,7 @@ struct TCMachineInfo : public ReferenceCounted { ACTOR Future updateServerMetrics( TCServerInfo *server ) { state StorageServerInterface ssi = 
server->lastKnownInterface; - state Future> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch ); + state Future> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch ); state Future resetRequest = Never(); state Future> interfaceChanged( server->onInterfaceChanged ); state Future serverRemoved( server->onRemoved ); @@ -104,7 +104,7 @@ ACTOR Future updateServerMetrics( TCServerInfo *server ) { return Void(); } metricsRequest = Never(); - resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskDataDistributionLaunch ); + resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch ); } when( std::pair _ssi = wait( interfaceChanged ) ) { ssi = _ssi.first; @@ -120,7 +120,7 @@ ACTOR Future updateServerMetrics( TCServerInfo *server ) { } else { resetRequest = Never(); - metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch ); + metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch ); } } } @@ -635,9 +635,9 @@ struct DDTeamCollection : ReferenceCounted { shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()), badTeamRemover(Void()), redundantTeamRemover(Void()), configuration(configuration), readyToStart(readyToStart), clearHealthyZoneFuture(Void()), - checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)), + checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskPriority::DataDistribution)), initialFailureReactionDelay( - delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution)), + delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskPriority::DataDistribution)), healthyTeamCount(0), storageServerSet(new LocalityMap()), initializationDoneActor(logOnCompletion(readyToStart && 
initialFailureReactionDelay, this)), optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), @@ -671,7 +671,7 @@ struct DDTeamCollection : ReferenceCounted { ACTOR static Future logOnCompletion( Future signal, DDTeamCollection* self ) { wait(signal); - wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskDataDistribution)); + wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskPriority::DataDistribution)); if(!self->primary || self->configuration.usableRegions == 1) { TraceEvent("DDTrackerStarting", self->distributorId) @@ -1919,7 +1919,7 @@ struct DDTeamCollection : ReferenceCounted { //Building teams can cause servers to become undesired, which can make teams unhealthy. //Let all of these changes get worked out before responding to the get team request - wait( delay(0, TaskDataDistributionLaunch) ); + wait( delay(0, TaskPriority::DataDistributionLaunch) ); return Void(); } @@ -2232,7 +2232,7 @@ ACTOR Future waitUntilHealthy(DDTeamCollection* self) { TraceEvent("WaitUntilHealthyStalled", self->distributorId).detail("Primary", self->primary).detail("ZeroHealthy", self->zeroHealthyTeams->get()).detail("ProcessingUnhealthy", self->processingUnhealthy->get()); wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange()); } - wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskLowPriority)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. + wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskPriority::Low)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. 
if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) { return Void(); } @@ -2638,7 +2638,7 @@ ACTOR Future trackExcludedServers( DDTeamCollection* self ) { if (nchid != lastChangeID) break; - wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskDataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead + wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskPriority::DataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead tr = Transaction(self->cx); } catch (Error& e) { wait( tr.onError(e) ); @@ -2757,14 +2757,14 @@ ACTOR Future serverMetricsPolling( TCServerInfo *server) { state double lastUpdate = now(); loop { wait( updateServerMetrics( server ) ); - wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskDataDistributionLaunch ) ); + wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskPriority::DataDistributionLaunch ) ); lastUpdate = now(); } } //Returns the KeyValueStoreType of server if it is different from self->storeType ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo *server) { - state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID(TaskDataDistribution))); + state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID(TaskPriority::DataDistribution))); if(type == self->configuration.storageServerStoreType && (self->includedDCs.empty() || std::find(self->includedDCs.begin(), self->includedDCs.end(), server->lastKnownInterface.locality.dcId()) != self->includedDCs.end()) ) wait(Future(Never())); @@ -2787,7 +2787,7 @@ ACTOR Future waitForAllDataRemoved( Database cx, UID serverID, Version add } // Wait for any change to 
the serverKeys for this server - wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskDataDistribution) ); + wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskPriority::DataDistribution) ); tr.reset(); } catch (Error& e) { wait( tr.onError(e) ); @@ -2830,7 +2830,7 @@ ACTOR Future storageServerFailureTracker( ASSERT(!inHealthyZone); healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false)); } else if(!inHealthyZone) { - healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution); + healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskPriority::DataDistribution); } choose { when ( wait(healthChanged) ) { @@ -3120,7 +3120,7 @@ ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { loop { choose { when( wait( self->recruitingStream.onChange() ) ) {} - when( wait( self->recruitingStream.get() == 0 ? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskDataDistribution) : Future(Never()) ) ) { break; } + when( wait( self->recruitingStream.get() == 0 ? 
delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskPriority::DataDistribution) : Future(Never()) ) ) { break; } } } TraceEvent("StorageServerRecruitment", self->distributorId) @@ -3147,12 +3147,12 @@ ACTOR Future initializeStorage( DDTeamCollection* self, RecruitStorageRepl self->recruitingIds.insert(interfaceId); self->recruitingLocalities.insert(candidateWorker.worker.address()); - state ErrorOr newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskDataDistribution ) ); + state ErrorOr newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskPriority::DataDistribution ) ); if(newServer.isError()) { TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError()); if( !newServer.isError( error_code_recruitment_failed ) && !newServer.isError( error_code_request_maybe_delivered ) ) throw newServer.getError(); - wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskDataDistribution) ); + wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution) ); } self->recruitingIds.erase(interfaceId); self->recruitingLocalities.erase(candidateWorker.worker.address()); @@ -3217,7 +3217,7 @@ ACTOR Future storageRecruiter( DDTeamCollection* self, Referenceget().clusterInterface.recruitStorage.getReply( rsr, TaskDataDistribution ) ); + fCandidateWorker = brokenPromiseToNever( db->get().clusterInterface.recruitStorage.getReply( rsr, TaskPriority::DataDistribution ) ); } choose { @@ -3388,7 +3388,7 @@ ACTOR Future dataDistributionTeamCollection( ACTOR Future waitForDataDistributionEnabled( Database cx ) { state Transaction tr(cx); loop { - wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskDataDistribution)); + wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskPriority::DataDistribution)); try { Optional mode = wait( tr.get( dataDistributionModeKey ) ); @@ -3516,7 +3516,7 @@ ACTOR Future dataDistribution(Reference self) state double lastLimited = 0; self->addActor.send( monitorBatchLimitedTime(self->dbInfo, 
&lastLimited) ); - state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DataDistributionLaunch, true, true); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; //cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); @@ -3646,7 +3646,7 @@ ACTOR Future dataDistribution(Reference self) } output.send( RelocateShard( keys, unhealthy ? PRIORITY_TEAM_UNHEALTHY : PRIORITY_RECOVER_MOVE ) ); } - wait( yield(TaskDataDistribution) ); + wait( yield(TaskPriority::DataDistribution) ); } vector tcis; diff --git a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index e155254850..d11fc63146 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -512,9 +512,9 @@ struct DDQueueData { // FIXME: is the merge case needed if( input.priority == PRIORITY_MERGE_SHARD ) { - wait( delay( 0.5, TaskDataDistribution - 2 ) ); + wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution )) ) ); } else { - wait( delay( 0.0001, TaskDataDistributionLaunch ) ); + wait( delay( 0.0001, TaskPriority::DataDistributionLaunch ) ); } loop { @@ -933,7 +933,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd .detail("Count", stuckCount) .detail("TeamCollectionId", tciIndex) .detail("NumOfTeamCollections", self->teamCollections.size()); - wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) ); + wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) ); } state std::vector destIds; @@ -993,7 +993,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd state Error error = success(); state Promise dataMovementComplete; state Future doMoveKeys = moveKeys(self->cx, rd.keys, destIds, healthyIds, self->lock, 
dataMovementComplete, &self->startMoveKeysParallelismLock, &self->finishMoveKeysParallelismLock, self->teamCollections.size() > 1, relocateShardInterval.pairID ); - state Future pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch ); + state Future pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch ); try { loop { choose { @@ -1016,7 +1016,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd self->dataTransferComplete.send(rd); } } - pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch ); + pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch ); } when( wait( signalledTransferComplete ? Never() : dataMovementComplete.getFuture() ) ) { self->fetchKeysComplete.insert( rd ); @@ -1066,7 +1066,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } else { TEST(true); // move to removed server healthyDestinations.addDataInFlightToTeam( -metrics.bytes ); - wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskDataDistributionLaunch ) ); + wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch ) ); } } } catch (Error& e) { @@ -1125,7 +1125,7 @@ ACTOR Future BgDDMountainChopper( DDQueueData* self, int teamCollectionInd state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; loop { - wait( delay(checkDelay, TaskDataDistributionLaunch) ); + wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) ); if (self->priority_relocations[PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { state Optional> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( 
GetTeamRequest( true, false, true ) ) ) ); if( randomTeam.present() ) { @@ -1160,7 +1160,7 @@ ACTOR Future BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex) state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; loop { - wait( delay(checkDelay, TaskDataDistributionLaunch) ); + wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) ); if (self->priority_relocations[PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { state Optional> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( GetTeamRequest( true, false, false ) ) ) ); if( randomTeam.present() ) { @@ -1244,7 +1244,7 @@ ACTOR Future dataDistributionQueue( bool wasEmpty = serversToLaunchFrom.empty(); self.queueRelocation( rs, serversToLaunchFrom ); if(wasEmpty && !serversToLaunchFrom.empty()) - launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch); + launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch); } when ( wait(launchQueuedWorkTimeout) ) { self.launchQueuedWork( serversToLaunchFrom ); @@ -1258,7 +1258,7 @@ ACTOR Future dataDistributionQueue( when ( RelocateData done = waitNext( self.dataTransferComplete.getFuture() ) ) { complete( done, self.busymap ); if(serversToLaunchFrom.empty() && !done.src.empty()) - launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch); + launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch); serversToLaunchFrom.insert(done.src.begin(), done.src.end()); } when ( RelocateData done = waitNext( self.relocationComplete.getFuture() ) ) { @@ -1266,7 +1266,7 @@ ACTOR Future dataDistributionQueue( self.finishRelocation(done.priority); self.fetchKeysComplete.erase( done ); //self.logRelocation( done, "ShardRelocatorDone" ); - actors.add( tag( delay(0, TaskDataDistributionLaunch), done.keys, rangesComplete ) ); + actors.add( tag( delay(0, 
TaskPriority::DataDistributionLaunch), done.keys, rangesComplete ) ); if( g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60 ) { TraceEvent(SevWarnAlways, "RelocationDurationTooLong").detail("Duration", now() - done.startTime); debug_setCheckRelocationDuration(false); diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index c4c8329754..ca4a849a33 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -140,7 +140,7 @@ ACTOR Future trackShardBytes( Reference>> shardSize, bool addToSizeEstimate = true) { - wait( delay( 0, TaskDataDistribution ) ); + wait( delay( 0, TaskPriority::DataDistribution ) ); /*TraceEvent("TrackShardBytesStarting") .detail("TrackerID", trackerID) @@ -260,7 +260,7 @@ ACTOR Future changeSizes( DataDistributionTracker* self, KeyRangeRef keys, } wait( waitForAll( sizes ) ); - wait( yield(TaskDataDistribution) ); + wait( yield(TaskPriority::DataDistribution) ); int64_t newShardsStartingSize = 0; for ( int i = 0; i < sizes.size(); i++ ) @@ -281,7 +281,7 @@ struct HasBeenTrueFor : NonCopyable { Future set() { if( !trigger.isValid() ) { cleared = Promise(); - trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, TaskDataDistribution - 1 ) || cleared.getFuture(); + trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture(); } return trigger; } @@ -361,7 +361,7 @@ ACTOR Future shardSplitter( self->sizeChanges.add( changeSizes( self, keys, shardSize->get().get().bytes ) ); } else { - wait( delay(1.0, TaskDataDistribution) ); //In case the reason the split point was off was due to a discrepancy between storage servers + wait( delay(1.0, TaskPriority::DataDistribution) ); //In case the reason the split point was off was due to a discrepancy between storage servers } return Void(); } @@ -529,7 +529,7 @@ ACTOR Future 
shardTracker( wait( yieldedFuture(self->maxShardSize->onChange()) ); // Since maxShardSize will become present for all shards at once, avoid slow tasks with a short delay - wait( delay( 0, TaskDataDistribution ) ); + wait( delay( 0, TaskPriority::DataDistribution ) ); /*TraceEvent("ShardTracker", self->distributorId) .detail("Begin", keys.begin) @@ -546,7 +546,7 @@ ACTOR Future shardTracker( // We could have a lot of actors being released from the previous wait at the same time. Immediately calling // delay(0) mitigates the resulting SlowTask - wait( delay(0, TaskDataDistribution) ); + wait( delay(0, TaskPriority::DataDistribution) ); } } catch (Error& e) { if (e.code() != error_code_actor_cancelled) @@ -593,12 +593,12 @@ ACTOR Future trackInitialShards(DataDistributionTracker *self, Referenceshards.size()-1; s++) { restartShardTrackers( self, KeyRangeRef( initData->shards[s].key, initData->shards[s+1].key ) ); - wait( yield( TaskDataDistribution ) ); + wait( yield( TaskPriority::DataDistribution ) ); } Future initialSize = changeSizes( self, KeyRangeRef(allKeys.begin, allKeys.end), 0 ); diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index e53fa5a29a..7ce1a5c9b0 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -1937,8 +1937,8 @@ KeyValueStoreSQLite::KeyValueStoreSQLite(std::string const& filename, UID id, Ke readCursors.resize(64); //< number of read threads sqlite3_soft_heap_limit64( SERVER_KNOBS->SOFT_HEAP_LIMIT ); // SOMEDAY: Is this a performance issue? Should we drop the cache sizes for individual threads? 
- int taskId = g_network->getCurrentTask(); - g_network->setCurrentTask(TaskDiskWrite); + TaskPriority taskId = g_network->getCurrentTask(); + g_network->setCurrentTask(TaskPriority::DiskWrite); writeThread->addThread( new Writer(filename, type==KeyValueStoreType::SSD_BTREE_V2, checkChecksums, checkIntegrity, writesComplete, springCleaningStats, diskBytesUsed, freeListPages, id, &readCursors) ); g_network->setCurrentTask(taskId); auto p = new Writer::InitAction(); @@ -1963,8 +1963,8 @@ StorageBytes KeyValueStoreSQLite::getStorageBytes() { void KeyValueStoreSQLite::startReadThreads() { int nReadThreads = readCursors.size(); - int taskId = g_network->getCurrentTask(); - g_network->setCurrentTask(TaskDiskRead); + TaskPriority taskId = g_network->getCurrentTask(); + g_network->setCurrentTask(TaskPriority::DiskRead); for(int i=0; iaddThread( new Reader(filename, type==KeyValueStoreType::SSD_BTREE_V2, readsComplete, logID, &readCursors[i]) ); g_network->setCurrentTask(taskId); diff --git a/fdbserver/LeaderElection.actor.cpp b/fdbserver/LeaderElection.actor.cpp index 3cc50609d3..5a97b6358f 100644 --- a/fdbserver/LeaderElection.actor.cpp +++ b/fdbserver/LeaderElection.actor.cpp @@ -30,7 +30,7 @@ Optional> getLeader( const vector submitCandidacy( Key key, LeaderElectionRegInterface coord, LeaderInfo myInfo, UID prevChangeID, Reference>>> nominees, int index ) { loop { auto const& nom = nominees->get()[index]; - Optional li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID ), TaskCoordinationReply ) ); + Optional li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? 
nom.get().changeID : UID(), prevChangeID ), TaskPriority::CoordinationReply ) ); if (li != nominees->get()[index]) { vector> v = nominees->get(); @@ -150,7 +150,7 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu // we might be breaking the leader election process for someone with better communications but lower ID, so change IDs. if ((!leader.present() || !leader.get().second) && std::count( nominees->get().begin(), nominees->get().end(), myInfo )) { if (!badCandidateTimeout.isValid()) - badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskCoordinationReply ); + badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskPriority::CoordinationReply ); } else badCandidateTimeout = Future(); @@ -183,12 +183,12 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu state vector> true_heartbeats; state vector> false_heartbeats; for(int i=0; i hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskCoordinationReply ); + Future hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskPriority::CoordinationReply ); true_heartbeats.push_back( onEqual(hb, true) ); false_heartbeats.push_back( onEqual(hb, false) ); } - state Future rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskCoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side? + state Future rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskPriority::CoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side? 
choose { when ( wait( quorum( true_heartbeats, true_heartbeats.size()/2+1 ) ) ) { diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 2dc8194d3a..eae38b50a7 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -51,7 +51,7 @@ struct LogRouterData { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, TaskPriority taskID ) { while(!self->version_messages.empty() && self->version_messages.front().first < before) { Version version = self->version_messages.front().first; int64_t messagesErased = 0; @@ -68,7 +68,7 @@ struct LogRouterData { return Void(); } - Future eraseMessagesBefore(Version before, LogRouterData *tlogData, int taskID) { + Future eraseMessagesBefore(Version before, LogRouterData *tlogData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, taskID); } }; @@ -197,7 +197,7 @@ ACTOR Future waitForVersion( LogRouterData *self, Version ver ) { while(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < ver) { if(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS > self->version.get()) { self->version.set( self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); } else { wait(self->minPopped.whenAtLeast((self->minPopped.get()+1))); } @@ -220,7 +220,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { loop { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { //FIXME: does this actually happen? 
@@ -247,7 +247,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { commitMessages(self, ver, messages); self->version.set( ver ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); //TraceEvent("LogRouterVersion").detail("Ver",ver); } lastVer = ver; @@ -260,7 +260,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { wait( waitForVersion(self, ver) ); self->version.set( ver ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); } break; } @@ -370,7 +370,7 @@ ACTOR Future logRouterPop( LogRouterData* self, TLogPopRequest req ) { } else if (req.to > tagData->popped) { tagData->popped = req.to; tagData->durableKnownCommittedVersion = req.durableKnownCommittedVersion; - wait(tagData->eraseMessagesBefore( req.to, self, TaskTLogPop )); + wait(tagData->eraseMessagesBefore( req.to, self, TaskPriority::TLogPop )); } state Version minPopped = std::numeric_limits::max(); @@ -384,7 +384,7 @@ ACTOR Future logRouterPop( LogRouterData* self, TLogPopRequest req ) { while(!self->messageBlocks.empty() && self->messageBlocks.front().first < minPopped) { self->messageBlocks.pop_front(); - wait(yield(TaskTLogPop)); + wait(yield(TaskPriority::TLogPop)); } self->poppedVersion = std::min(minKnownCommittedVersion, self->minKnownCommittedVersion); diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index dad3779938..a261354fbe 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -341,7 +341,7 @@ struct ILogSystem { //returns immediately if hasMessage() returns true. //returns when either the result of hasMessage() or version() has changed. 
- virtual Future getMore(int taskID = TaskTLogPeekReply) = 0; + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) = 0; //returns when the failure monitor detects that the servers associated with the cursor are failed virtual Future onFailed() = 0; @@ -406,7 +406,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -454,7 +454,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -499,7 +499,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -533,7 +533,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -593,7 +593,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = 
TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index ecf1877536..dee74c2dde 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -133,7 +133,7 @@ void ILogSystem::ServerPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) { +ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) { if( !self->interf || self->messageVersion >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -192,7 +192,7 @@ ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self } } -ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) { +ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) { if( !self->interf || self->messageVersion >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -225,7 +225,7 @@ ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int ta } } -Future ILogSystem::ServerPeekCursor::getMore(int taskID) { +Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { //TraceEvent("SPC_GetMore", randomID).detail("HasMessage", hasMessage()).detail("More", !more.isValid() || more.isReady()).detail("MessageVersion", messageVersion.toString()).detail("End", end.toString()); if( hasMessage() ) return Void(); @@ -431,7 +431,7 @@ void ILogSystem::MergedPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, int taskID) { +ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) { loop { 
//TraceEvent("MPC_GetMoreA", self->randomID).detail("Start", startVersion.toString()); if(self->bestServer >= 0 && self->serverCursors[self->bestServer]->isActive()) { @@ -452,7 +452,7 @@ ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess } } -Future ILogSystem::MergedPeekCursor::getMore(int taskID) { +Future ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) { if(!serverCursors.size()) return Never(); @@ -692,7 +692,7 @@ void ILogSystem::SetPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, int taskID) { +ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) { loop { //TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag); if(self->bestServer >= 0 && self->bestSet >= 0 && self->serverCursors[self->bestSet][self->bestServer]->isActive()) { @@ -753,7 +753,7 @@ ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer } } -Future ILogSystem::SetPeekCursor::getMore(int taskID) { +Future ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) { auto startVersion = version(); calcHasMessage(); if( hasMessage() ) @@ -848,7 +848,7 @@ void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) { cursors.back()->advanceTo(n); } -Future ILogSystem::MultiCursor::getMore(int taskID) { +Future ILogSystem::MultiCursor::getMore(TaskPriority taskID) { LogMessageVersion startVersion = cursors.back()->version(); while( cursors.size() > 1 && cursors.back()->version() >= epochEnds.back() ) { poppedVersion = std::max(poppedVersion, cursors.back()->popped()); @@ -964,7 +964,7 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) { ASSERT(false); } -ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, int taskID ) { +ACTOR Future bufferedGetMoreLoader( 
ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, TaskPriority taskID ) { loop { wait(yield()); if(cursor->version().version >= maxVersion) { @@ -981,7 +981,7 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe } } -ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID ) { +ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriority taskID ) { if( self->messageVersion.version >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -1015,7 +1015,7 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID return Void(); } -Future ILogSystem::BufferedCursor::getMore(int taskID) { +Future ILogSystem::BufferedCursor::getMore(TaskPriority taskID) { if( hasMessage() ) return Void(); return bufferedGetMore(this, taskID); diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index 44674ec3bb..91a0d2444d 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -50,7 +50,7 @@ struct MasterInterface { } void initEndpoints() { - getCommitVersion.getEndpoint( TaskProxyGetConsistentReadVersion ); + getCommitVersion.getEndpoint( TaskPriority::ProxyGetConsistentReadVersion ); } }; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 3fc4665a15..57d2211fd8 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -158,7 +158,7 @@ ACTOR Future queueTransactionStartRequests( if (now() - *lastGRVTime > *GRVBatchTime) *lastGRVTime = now() - *GRVBatchTime; - forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskProxyGRVTimer)); + forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskPriority::ProxyGRVTimer)); } transactionQueue->push(std::make_pair(req, counter--)); @@ -263,7 +263,7 @@ struct ProxyCommitData { lastVersionTime(0), commitVersionRequestNumber(1), 
mostRecentProcessedRequestNumber(0), getConsistentReadVersion(getConsistentReadVersion), commit(commit), lastCoalesceTime(0), localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), - firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), db(db), + firstProxy(firstProxy), cx(openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true)), db(db), singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), commitBatchesMemBytesCount(0), lastTxsPop(0) {} }; @@ -350,7 +350,7 @@ struct ResolutionRequestBuilder { }; ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStream, int> > out, FutureStream in, int desiredBytes, int64_t memBytesLimit) { - wait(delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher)); + wait(delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher)); state double lastBatch = 0; @@ -363,7 +363,7 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamMAX_COMMIT_BATCH_INTERVAL, TaskProxyCommitBatcher); + timeout = delayJittered(SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL, TaskPriority::ProxyCommitBatcher); } while(!timeout.isReady() && !(batch.size() == SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_COUNT_MAX || batchBytes >= desiredBytes)) { @@ -387,10 +387,10 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamcommitBatchStartNotifications.send(Void()); if(now() - lastBatch > commitData->commitBatchInterval) { - timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskProxyCommitBatcher); + timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskPriority::ProxyCommitBatcher); } else { - timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskProxyCommitBatcher); + timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskPriority::ProxyCommitBatcher); } } @@ -398,7 +398,7 
@@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamcommitBatchStartNotifications.send(Void()); - timeout = delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher); + timeout = delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher); batch = std::vector(); batchBytes = 0; } @@ -457,7 +457,7 @@ ACTOR Future commitBatch( ASSERT(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS <= SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT); // since we are using just the former to limit the number of versions actually in flight! // Active load balancing runs at a very high priority (to obtain accurate estimate of memory used by commit batches) so we need to downgrade here - wait(delay(0, TaskProxyCommit)); + wait(delay(0, TaskPriority::ProxyCommit)); self->lastVersionTime = t1; @@ -534,7 +534,7 @@ ACTOR Future commitBatch( vector< Future > replies; for (int r = 0; rresolvers.size(); r++) { requests.requests[r].debugID = debugID; - replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskProxyResolverReply))); + replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskPriority::ProxyResolverReply))); } state vector> transactionResolverMap = std::move( requests.transactionResolverMap ); @@ -1135,7 +1135,7 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi state vector> proxyVersions; for (auto const& p : *otherProxies) - proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskTLogConfirmRunningReply))); + proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskPriority::TLogConfirmRunningReply))); if (!(flags&GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)) { @@ -1292,7 +1292,7 @@ ACTOR static Future transactionStarter( } if (!transactionQueue.empty()) - forwardPromise(GRVTimer, 
delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskProxyGRVTimer)); + forwardPromise(GRVTimer, delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskPriority::ProxyGRVTimer)); /*TraceEvent("GRVBatch", proxy.id()) .detail("Elapsed", elapsed) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 4893f3c6a1..6a979e3cc5 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -130,12 +130,12 @@ ACTOR Future> addReadWriteDestinations(KeyRangeRef shard, vector> > srcChecks; for(int s=0; sSERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), srcInterfs[s].id(), 0 ) ); + srcChecks.push_back( checkReadWrite( srcInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), srcInterfs[s].id(), 0 ) ); } state vector< Future> > destChecks; for(int s=0; sSERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), destInterfs[s].id(), version ) ); + destChecks.push_back( checkReadWrite( destInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), destInterfs[s].id(), version ) ); } wait( waitForAll(srcChecks) && waitForAll(destChecks) ); @@ -225,7 +225,7 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve state TraceInterval interval("RelocateShard_StartMoveKeys"); //state TraceInterval waitInterval(""); - wait( startMoveKeysLock->take( TaskDataDistributionLaunch ) ); + wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) ); state FlowLock::Releaser releaser( *startMoveKeysLock ); TraceEvent(SevDebug, interval.begin(), relocationIntervalId); @@ -255,7 +255,7 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve //Keep track of shards for all src servers so that we can preserve their values in 
serverKeys state Map> shardMap; - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); wait( checkMoveKeysLock(&tr, lock) ); @@ -394,11 +394,11 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve ACTOR Future waitForShardReady( StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode ) { loop { try { - std::pair rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskMoveKeys ) ); + std::pair rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskPriority::MoveKeys ) ); if (rep.first >= minVersion) { return Void(); } - wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskMoveKeys ) ); + wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys ) ); } catch (Error& e) { if( e.code() != error_code_timed_out ) { @@ -419,7 +419,7 @@ ACTOR Future checkFetchingState( Database cx, vector dest, KeyRange k try { if (BUGGIFY) wait(delay(5)); - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); vector< Future< Optional > > serverListEntries; @@ -439,7 +439,7 @@ ACTOR Future checkFetchingState( Database cx, vector dest, KeyRange k } wait( timeoutError( waitForAll( requests ), - SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskMoveKeys ) ); + SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys ) ); dataMovementComplete.send(Void()); return Void(); @@ -480,11 +480,11 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest //printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); loop { try { - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); releaser.release(); - wait( finishMoveKeysParallelismLock->take( 
TaskDataDistributionLaunch ) ); + wait( finishMoveKeysParallelismLock->take( TaskPriority::DataDistributionLaunch ) ); releaser = FlowLock::Releaser( *finishMoveKeysParallelismLock ); wait( checkMoveKeysLock(&tr, lock) ); @@ -632,7 +632,7 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest for(int s=0; sSERVER_READY_QUORUM_TIMEOUT, Void(), TaskMoveKeys ) ); + wait( timeout( waitForAll( serverReady ), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskPriority::MoveKeys ) ); int count = dest.size() - newDestinations.size(); for(int s=0; s removeStorageServer( Database cx, UID serverID, MoveKeysLock if (!canRemove) { TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to reverse its mistake. TraceEvent(SevWarn,"NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID); - wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskDataDistributionLaunch) ); + wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch) ); tr.reset(); TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove); } else { diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index fd2be1f08f..bd8db636a1 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -333,7 +333,7 @@ namespace oldTLog_4_6 { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference tlogData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference tlogData, TaskPriority taskID ) { while(!self->version_messages.empty() && self->version_messages.front().first < before) { Version version = self->version_messages.front().first; std::pair &sizes = tlogData->version_sizes[version]; @@ -359,7 +359,7 @@ 
namespace oldTLog_4_6 { return Void(); } - Future eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference tlogData, int taskID) { + Future eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference tlogData, TaskPriority taskID) { return eraseMessagesBefore(this, before, gBytesErased, tlogData, taskID); } }; @@ -526,21 +526,21 @@ namespace oldTLog_4_6 { self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tag->key, currentVersion ), wr.toValue() ) ); - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tag->value.version_messages.begin(), tag->value.version_messages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); } } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } self->persistentData->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistCurrentVersionKeys.begin), BinaryWriter::toValue(newPersistentDataVersion, Unversioned()) ) ); logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
@@ -548,20 +548,20 @@ namespace oldTLog_4_6 { logData->persistentDataDurableVersion = newPersistentDataVersion; for(tag = logData->tag_data.begin(); tag != logData->tag_data.end(); ++tag) { - wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -586,7 +586,7 @@ namespace oldTLog_4_6 { } if(!self->queueOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -621,14 +621,14 @@ namespace oldTLog_4_6 { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { self->updatePersist = updatePersistentData(self, logData, nextVersion); wait( self->updatePersist ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -639,9 +639,9 @@ namespace oldTLog_4_6 { if(logData->persistentDataDurableVersion == logData->version.get()) { self->queueOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -650,7 +650,7 @@ namespace oldTLog_4_6 { while( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT && sizeItr != logData->version_sizes.end() && (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD || sizeItr->value.first == 0) ) { - wait( yield(TaskUpdateStorage) ); + wait( yield(TaskPriority::UpdateStorage) ); ++sizeItr; nextVersion = sizeItr == logData->version_sizes.end() ? logData->version.get() : sizeItr->key; @@ -662,7 +662,7 @@ namespace oldTLog_4_6 { totalSize += it->second.expectedSize(); } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } prevVersion = nextVersion; @@ -673,7 +673,7 @@ namespace oldTLog_4_6 { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { self->updatePersist = updatePersistentData(self, logData, nextVersion); @@ -681,21 +681,21 @@ namespace oldTLog_4_6 { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. 
- wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -823,7 +823,7 @@ namespace oldTLog_4_6 { ti->value.popped_recently = true; //if (to.epoch == self->epoch()) if ( req.to > logData->persistentDataDurableVersion ) - wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskTLogPop )); + wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskPriority::TLogPop )); } req.reply.send(Void()); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index fc9251ec78..c9837d6814 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -297,7 +297,7 @@ struct TLogData : NonCopyable { concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() { - cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); } }; @@ -323,7 +323,7 @@ struct LogData : NonCopyable, public ReferenceCounted { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, TaskPriority taskID ) 
{ while(!self->versionMessages.empty() && self->versionMessages.front().first < before) { Version version = self->versionMessages.front().first; std::pair &sizes = logData->version_sizes[version]; @@ -352,7 +352,7 @@ struct LogData : NonCopyable, public ReferenceCounted { return Void(); } - Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, int taskID) { + Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, logData, taskID); } }; @@ -607,14 +607,14 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tagData->tag, currentVersion ), wr.toValue() ) ); - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); } } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } } } @@ -624,7 +624,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
@@ -634,22 +634,22 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { if(logData->tag_data[tagLocality][tagId]) { - wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } } } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -674,7 +674,7 @@ ACTOR Future updateStorage( TLogData* self ) { } if(!self->queueOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -698,7 +698,7 @@ ACTOR Future updateStorage( TLogData* self ) { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { @@ -707,7 +707,7 @@ ACTOR Future updateStorage( TLogData* self ) { wait( updatePersistentData(self, logData, nextVersion) ); commitLockReleaser.release(); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -718,9 +718,9 @@ ACTOR Future updateStorage( TLogData* self ) { if(logData->persistentDataDurableVersion == logData->version.get()) { self->queueOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -741,7 +741,7 @@ ACTOR Future updateStorage( TLogData* self ) { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { wait( self->persistentDataCommitLock.take() ); @@ -751,21 +751,21 @@ ACTOR Future updateStorage( TLogData* self ) { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -943,7 +943,7 @@ ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere } if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop)); + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); } return Void(); @@ -1059,7 +1059,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if( req.tag.locality == tagLocalityLogRouter ) { wait( self->concurrentLogRouterReads.take() ); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait( delay(0.0, TaskLowPriority) ); + wait( delay(0.0, TaskPriority::Low) ); } if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { @@ -1068,7 +1068,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere // slightly faster over keeping the rest of the cluster operating normally. // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests // that impact recovery duration. 
- wait(delay(0, TaskTLogSpilledPeekReply)); + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } Version poppedVer = poppedVersion(logData, req.tag); @@ -1173,7 +1173,7 @@ ACTOR Future watchDegraded(TLogData* self) { //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask state int loopCount = 0; while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { - wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskPriority::Low)); loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); @@ -1509,7 +1509,7 @@ ACTOR Future tLogCommit( .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } // while exec op is being committed, no new transactions will be admitted. @@ -1849,7 +1849,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? 
r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { @@ -1872,7 +1872,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } state Version ver = 0; @@ -1912,7 +1912,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } lastVer = ver; ver = r->version().version; @@ -1949,7 +1949,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } break; } diff --git a/fdbserver/Orderer.actor.h b/fdbserver/Orderer.actor.h index cd9d3d5a19..71f970ce45 100644 --- a/fdbserver/Orderer.actor.h +++ b/fdbserver/Orderer.actor.h @@ -38,7 +38,7 @@ public: ready = NotifiedVersion(s); started = false; } - Future order( Seq s, int taskID = TaskDefaultYield ) { + Future order( Seq s, TaskPriority taskID = TaskPriority::DefaultYield ) { if ( ready.get() < s ) return waitAndOrder( this, s, taskID ); else @@ -54,7 +54,7 @@ public: return ready.whenAtLeast(v); } private: - ACTOR static Future waitAndOrder( Orderer* self, Seq s, int taskID ) { + ACTOR static Future waitAndOrder( Orderer* self, Seq s, TaskPriority taskID ) { wait( self->ready.whenAtLeast(s) ); wait( yield( taskID ) || self->shutdown.getFuture() ); return self->dedup(s); diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 9813592fc9..6de6f31f82 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ 
b/fdbserver/Ratekeeper.actor.cpp @@ -300,7 +300,7 @@ ACTOR Future trackEachStorageServer( ACTOR Future monitorServerListChange( Reference> dbInfo, PromiseStream< std::pair> > serverChanges) { - state Database db = openDBOnServer(dbInfo, TaskRatekeeper, true, true); + state Database db = openDBOnServer(dbInfo, TaskPriority::Ratekeeper, true, true); state std::map oldServers; state Transaction tr(db); @@ -629,7 +629,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { } ACTOR Future configurationMonitor(Reference> dbInfo, DatabaseConfiguration* conf) { - state Database cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + state Database cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index db49433692..41834bb163 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -114,9 +114,9 @@ ACTOR Future resolveBatch( } } - if (check_yield(TaskDefaultEndpoint)) { - wait( delay( 0, TaskLowPriority ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right? - g_network->setCurrentTask(TaskDefaultEndpoint); + if (check_yield(TaskPriority::DefaultEndpoint)) { + wait( delay( 0, TaskPriority::Low ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right? + g_network->setCurrentTask(TaskPriority::DefaultEndpoint); } if (self->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!) 
diff --git a/fdbserver/ResolverInterface.h b/fdbserver/ResolverInterface.h index 2bb808d84b..65b46a5941 100644 --- a/fdbserver/ResolverInterface.h +++ b/fdbserver/ResolverInterface.h @@ -44,8 +44,8 @@ struct ResolverInterface { bool operator != ( ResolverInterface const& r ) const { return id() != r.id(); } NetworkAddress address() const { return resolve.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - metrics.getEndpoint( TaskResolutionMetrics ); - split.getEndpoint( TaskResolutionMetrics ); + metrics.getEndpoint( TaskPriority::ResolutionMetrics ); + split.getEndpoint( TaskPriority::ResolutionMetrics ); } template diff --git a/fdbserver/RestoreInterface.h b/fdbserver/RestoreInterface.h index 847e940d9b..670946d9ab 100644 --- a/fdbserver/RestoreInterface.h +++ b/fdbserver/RestoreInterface.h @@ -37,7 +37,7 @@ struct RestoreInterface { NetworkAddress address() const { return test.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - test.getEndpoint( TaskClusterController ); + test.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 81330eac10..95e14136f9 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -215,7 +215,7 @@ ACTOR Future simulatedFDBDRebooter(Referencec_str(), coordFolder->c_str()); wait(g_simulator.onProcess(process, - TaskDefaultYield)); // Now switch execution to the process on which we will run + TaskPriority::DefaultYield)); // Now switch execution to the process on which we will run state Future onShutdown = process->onShutdown(); try { @@ -1399,7 +1399,7 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot Standalone(deterministicRandom()->randomUniqueID().toString()), Optional>()), ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource), "", ""), - TaskDefaultYield)); + TaskPriority::DefaultYield)); Sim2FileSystem::newFileSystem(); 
FlowTransport::createInstance(true, 1); if (tlsOptions->enabled()) { diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 47c61aeb9f..e4c31fdf3d 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1809,7 +1809,7 @@ ACTOR Future layerStatusFetcher(Database cx, JsonBuilderArray ACTOR Future lockedStatusFetcher(Reference> db, JsonBuilderArray *messages, std::set *incomplete_reasons) { state JsonBuilderObject statusObj; - state Database cx = openDBOnServer(db, TaskDefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware + state Database cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware state Transaction tr(cx); state int timeoutSeconds = 5; state Future getTimeout = delay(timeoutSeconds); diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index aa0c8a622e..641843f7fe 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -56,11 +56,11 @@ struct TLogInterface { bool operator == ( TLogInterface const& r ) const { return id() == r.id(); } NetworkAddress address() const { return peekMessages.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - getQueuingMetrics.getEndpoint( TaskTLogQueuingMetrics ); - popMessages.getEndpoint( TaskTLogPop ); - peekMessages.getEndpoint( TaskTLogPeek ); - confirmRunning.getEndpoint( TaskTLogConfirmRunning ); - commit.getEndpoint( TaskTLogCommit ); + getQueuingMetrics.getEndpoint( TaskPriority::TLogQueuingMetrics ); + popMessages.getEndpoint( TaskPriority::TLogPop ); + peekMessages.getEndpoint( TaskPriority::TLogPeek ); + confirmRunning.getEndpoint( TaskPriority::TLogConfirmRunning ); + commit.getEndpoint( TaskPriority::TLogCommit ); } template diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 52d0079ab7..98dab7d489 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -349,7 +349,7 @@ 
struct TLogData : NonCopyable { concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() { - cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); } }; @@ -379,7 +379,7 @@ struct LogData : NonCopyable, public ReferenceCounted { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, TaskPriority taskID ) { while(!self->versionMessages.empty() && self->versionMessages.front().first < before) { Version version = self->versionMessages.front().first; std::pair &sizes = logData->version_sizes[version]; @@ -408,7 +408,7 @@ struct LogData : NonCopyable, public ReferenceCounted { return Void(); } - Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, int taskID) { + Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, logData, taskID); } }; @@ -766,7 +766,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { state Reference tagData = logData->tag_data[tagLocality][tagId]; if(tagData) { - wait(tagData->eraseMessagesBefore( tagData->popped, self, logData, TaskUpdateStorage )); + wait(tagData->eraseMessagesBefore( tagData->popped, self, logData, TaskPriority::UpdateStorage )); state Version currentVersion = 0; // Clear recently popped versions from persistentData if necessary updatePersistentPopped( self, logData, tagData ); @@ -819,7 +819,7 @@ ACTOR Future updatePersistentData( TLogData* self, 
Reference logD wr << uint32_t(0); } - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); @@ -832,7 +832,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD tagData->poppedLocation = std::min(tagData->poppedLocation, firstLocation); } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } } } @@ -847,7 +847,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
@@ -857,22 +857,22 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { if(logData->tag_data[tagLocality][tagId]) { - wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } } } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -915,7 +915,7 @@ ACTOR Future updateStorage( TLogData* self ) { } if(!self->spillOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -940,7 +940,7 @@ ACTOR Future updateStorage( TLogData* self ) { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { @@ -953,7 +953,7 @@ ACTOR Future updateStorage( TLogData* self ) { } commitLockReleaser.release(); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -964,9 +964,9 @@ ACTOR Future updateStorage( TLogData* self ) { if(logData->persistentDataDurableVersion == logData->version.get()) { self->spillOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -988,7 +988,7 @@ ACTOR Future updateStorage( TLogData* self ) { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { wait( self->persistentDataCommitLock.take() ); @@ -1001,21 +1001,21 @@ ACTOR Future updateStorage( TLogData* self ) { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -1194,7 +1194,7 @@ ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere } if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop)); + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); } return Void(); @@ -1346,7 +1346,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if( req.tag.locality == tagLocalityLogRouter ) { wait( self->concurrentLogRouterReads.take() ); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait( delay(0.0, TaskLowPriority) ); + wait( delay(0.0, TaskPriority::Low) ); } if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { @@ -1355,7 +1355,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere // slightly faster over keeping the rest of the cluster operating normally. // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests // that impact recovery duration. 
- wait(delay(0, TaskTLogSpilledPeekReply)); + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } Version poppedVer = poppedVersion(logData, req.tag); @@ -1456,7 +1456,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if (earlyEnd) break; } earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK+1); - wait( self->peekMemoryLimiter.take(TaskTLogSpilledPeekReply, commitBytes) ); + wait( self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes) ); state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); state std::vector>> messageReads; messageReads.reserve( commitLocations.size() ); @@ -1540,7 +1540,7 @@ ACTOR Future watchDegraded(TLogData* self) { //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask state int loopCount = 0; while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { - wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskPriority::Low)); loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); @@ -1876,7 +1876,7 @@ ACTOR Future tLogCommit( .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } // while exec op is being committed, no new transactions will be admitted. @@ -2223,7 +2223,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? 
r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { @@ -2246,7 +2246,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } state Version ver = 0; @@ -2286,7 +2286,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } lastVer = ver; ver = r->version().version; @@ -2323,7 +2323,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } break; } diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2e25daae3b..d9ba5637b2 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -431,7 +431,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> tLogCommitResults; for(int loc=0; loc< it->logServers.size(); loc++) { Standalone msg = data.getMessages(location); - allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, data.getHasExecOp(), debugID ), TaskTLogCommitReply ) ); + allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, data.getHasExecOp(), debugID ), TaskPriority::TLogCommitReply ) ); Future 
commitSuccess = success(allReplies.back()); addActor.get().send(commitSuccess); tLogCommitResults.push_back(commitSuccess); @@ -961,7 +961,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedget().present() ) { alive.push_back( brokenPromiseToNever( t->get().interf().confirmRunning.getReply( TLogConfirmRunningRequest(debugID), - TaskTLogConfirmRunningReply ) ) ); + TaskPriority::TLogConfirmRunningReply ) ) ); numPresent++; } else { alive.push_back( Never() ); diff --git a/fdbserver/VFSAsync.cpp b/fdbserver/VFSAsync.cpp index 95e6b958a4..3d53aaccfb 100644 --- a/fdbserver/VFSAsync.cpp +++ b/fdbserver/VFSAsync.cpp @@ -713,7 +713,7 @@ static int asyncSleep(sqlite3_vfs *pVfs, int microseconds){ waitFor( delay(FLOW_KNOBS->MAX_BUGGIFIED_DELAY) ); return 0; } - waitFor( g_network->delay( microseconds*1e-6, TaskDefaultDelay ) || simCancel ); + waitFor( g_network->delay( microseconds*1e-6, TaskPriority::DefaultDelay ) || simCancel ); return microseconds; } catch( Error &e ) { TraceEvent(SevError, "AsyncSleepError").error(e,true); diff --git a/fdbserver/WaitFailure.actor.cpp b/fdbserver/WaitFailure.actor.cpp index 778128f830..6ab6efeb74 100644 --- a/fdbserver/WaitFailure.actor.cpp +++ b/fdbserver/WaitFailure.actor.cpp @@ -37,7 +37,7 @@ ACTOR Future waitFailureServer(FutureStream> waitFailur } } -ACTOR Future waitFailureClient(RequestStream> waitFailure, double reactionTime, double reactionSlope, int taskID){ +ACTOR Future waitFailureClient(RequestStream> waitFailure, double reactionTime, double reactionSlope, TaskPriority taskID){ loop { try { state double start = now(); @@ -55,7 +55,7 @@ ACTOR Future waitFailureClient(RequestStream> waitFailu } } -ACTOR Future waitFailureClientStrict(RequestStream> waitFailure, double failureReactionTime, int taskID){ +ACTOR Future waitFailureClientStrict(RequestStream> waitFailure, double failureReactionTime, TaskPriority taskID){ loop { wait(waitFailureClient(waitFailure, 0, 0, taskID)); wait(delay(failureReactionTime, 
taskID) || IFailureMonitor::failureMonitor().onStateEqual( waitFailure.getEndpoint(), FailureStatus(false))); @@ -65,7 +65,7 @@ ACTOR Future waitFailureClientStrict(RequestStream> wai } } -ACTOR Future waitFailureTracker(RequestStream> waitFailure, Reference> failed, double reactionTime, double reactionSlope, int taskID){ +ACTOR Future waitFailureTracker(RequestStream> waitFailure, Reference> failed, double reactionTime, double reactionSlope, TaskPriority taskID){ loop { try { failed->set( IFailureMonitor::failureMonitor().getState(waitFailure.getEndpoint()).isFailed() ); diff --git a/fdbserver/WaitFailure.h b/fdbserver/WaitFailure.h index 9ef3b4c3a0..413dc9a56a 100644 --- a/fdbserver/WaitFailure.h +++ b/fdbserver/WaitFailure.h @@ -26,13 +26,13 @@ Future waitFailureServer(const FutureStream>& waitFailu // talks to a wait failure server, returns Void on failure Future waitFailureClient(const RequestStream>& waitFailure, - double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); + double const& failureReactionTime=0, double const& failureReactionSlope=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); // talks to a wait failure server, returns Void on failure, reaction time is always waited -Future waitFailureClientStrict(const RequestStream>& waitFailure, double const& failureReactionTime=0, int const& taskID=TaskDefaultEndpoint); +Future waitFailureClientStrict(const RequestStream>& waitFailure, double const& failureReactionTime=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); // talks to a wait failure server, updates failed to be true or false based on failure status. 
Future waitFailureTracker(const RequestStream>& waitFailure, Reference> const& failed, - double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); + double const& failureReactionTime=0, double const& failureReactionSlope=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); -#endif \ No newline at end of file +#endif diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index 8370e7fdde..ffd373194c 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -392,7 +392,7 @@ void endRole(const Role &role, UID id, std::string reason, bool ok = true, Error struct ServerDBInfo; -class Database openDBOnServer( Reference> const& db, int taskID = TaskDefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); +class Database openDBOnServer( Reference> const& db, TaskPriority taskID = TaskPriority::DefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); ACTOR Future extractClusterInterface(Reference>> a, Reference>> b); diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 4d7f58796e..1d785b5d22 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -493,7 +493,7 @@ Future startSystemMonitor(std::string dataFolder, OptionalgetLocalAddress().ip)); systemMonitor(); - return recurring( &systemMonitor, 5.0, TaskFlushTrace ); + return recurring( &systemMonitor, 5.0, TaskPriority::FlushTrace ); } void testIndexedSet(); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 205b1dbc19..84365d455f 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -464,7 +464,7 @@ Future sendMasterRegistration( MasterData* self, LogSystemConfig const& lo } ACTOR Future updateRegistration( Reference self, Reference logSystem ) { - state Database cx = openDBOnServer(self->dbInfo, TaskDefaultEndpoint, true, 
true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, true, true); state Future trigger = self->registrationTrigger.onTrigger(); state Future updateLogsKey; @@ -1017,12 +1017,12 @@ ACTOR Future resolutionBalancing(Reference self) { state CoalescedKeyRangeMap key_resolver; key_resolver.insert(allKeys, 0); loop { - wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskResolutionMetrics)); + wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskPriority::ResolutionMetrics)); while(self->resolverChanges.get().size()) wait(self->resolverChanges.onChange()); state std::vector> futures; for (auto& p : self->resolvers) - futures.push_back(brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskResolutionMetrics))); + futures.push_back(brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskPriority::ResolutionMetrics))); wait( waitForAll(futures) ); state IndexedSet, NoMetric> metrics; @@ -1047,7 +1047,7 @@ ACTOR Future resolutionBalancing(Reference self) { req.offset = amount; req.range = range.first; - ResolutionSplitReply split = wait( brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply(req, TaskResolutionMetrics)) ); + ResolutionSplitReply split = wait( brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply(req, TaskPriority::ResolutionMetrics)) ); KeyRangeRef moveRange = range.second ? 
KeyRangeRef( range.first.begin, split.key ) : KeyRangeRef( split.key, range.first.end ); movedRanges.push_back_deep(movedRanges.arena(), ResolverMoveRef(moveRange, dest)); TraceEvent("MovingResolutionRange").detail("Src", src).detail("Dest", dest).detail("Amount", amount).detail("StartRange", range.first).detail("MoveRange", moveRange).detail("Used", split.used).detail("KeyResolverRanges", key_resolver.size()); @@ -1185,7 +1185,7 @@ ACTOR Future trackTlogRecovery( Reference self, Reference configurationMonitor( Reference self ) { - state Database cx = openDBOnServer(self->dbInfo, TaskDefaultEndpoint, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); diff --git a/fdbserver/networktest.actor.cpp b/fdbserver/networktest.actor.cpp index 61bf80ed55..795dd769c5 100644 --- a/fdbserver/networktest.actor.cpp +++ b/fdbserver/networktest.actor.cpp @@ -30,7 +30,7 @@ NetworkTestInterface::NetworkTestInterface( NetworkAddress remote ) NetworkTestInterface::NetworkTestInterface( INetwork* local ) { - test.makeWellKnownEndpoint( WLTOKEN_NETWORKTEST, TaskDefaultEndpoint ); + test.makeWellKnownEndpoint( WLTOKEN_NETWORKTEST, TaskPriority::DefaultEndpoint ); } ACTOR Future networkTestServer() { diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 28d47fa9cb..090f3f5df5 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -550,7 +550,7 @@ public: newestDirtyVersion.insert(allKeys, invalidVersion); addShard( ShardInfo::newNotAssigned( allKeys ) ); - cx = openDBOnServer(db, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true); } //~StorageServer() { fclose(log); } @@ -828,7 +828,7 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to 
downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); @@ -1345,7 +1345,7 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); try { if( req.debugID.present() ) @@ -1458,7 +1458,7 @@ ACTOR Future getKey( StorageServer* data, GetKeyRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); try { state Version version = wait( waitForVersion( data, req.version ) ); @@ -2003,7 +2003,7 @@ ACTOR Future fetchKeys( StorageServer *data, AddingShard* shard ) { TraceEvent(SevDebug, "FetchKeysVersionSatisfied", data->thisServerID).detail("FKID", interval.pairID); - wait( data->fetchKeysParallelismLock.take( TaskDefaultYield, fetchBlockBytes ) ); + wait( data->fetchKeysParallelismLock.take( TaskPriority::DefaultYield, fetchBlockBytes ) ); state FlowLock::Releaser holdingFKPL( data->fetchKeysParallelismLock, fetchBlockBytes ); state double executeStart = now(); @@ -2590,7 +2590,7 @@ ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) } data->behind = true; - wait( delayJittered(.005, TaskTLogPeekReply) ); + wait( delayJittered(.005, TaskPriority::TLogPeekReply) ); } while( data->byteSampleClearsTooLarge.get() ) { @@ -2617,7 +2617,7 @@ ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) *pReceivedUpdate = true; start = now(); - wait( data->durableVersionLock.take(TaskTLogPeekReply,1) ); + wait( 
data->durableVersionLock.take(TaskPriority::TLogPeekReply,1) ); state FlowLock::Releaser holdingDVL( data->durableVersionLock ); if(now() - start > 0.1) TraceEvent("SSSlowTakeLock1", data->thisServerID).detailf("From", "%016llx", debug_lastLoadBalanceResultEndpointToken).detail("Duration", now() - start).detail("Version", data->version.get()); @@ -2865,11 +2865,11 @@ ACTOR Future updateStorage(StorageServer* data) { if (g_network->isSimulated()) { double endTime = g_simulator.checkDisabled(format("%s/updateStorage", data->thisServerID.toString().c_str())); if(endTime > now()) { - wait(delay(endTime - now(), TaskUpdateStorage)); + wait(delay(endTime - now(), TaskPriority::UpdateStorage)); } } wait( data->desiredOldestVersion.whenAtLeast( data->storageVersion()+1 ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); state Promise durableInProgress; data->durableInProgress = durableInProgress.getFuture(); @@ -2882,10 +2882,10 @@ ACTOR Future updateStorage(StorageServer* data) { state bool done = data->storage.makeVersionMutationsDurable(newOldestVersion, desiredVersion, bytesLeft); // We want to forget things from these data structures atomically with changing oldestVersion (and "before", since oldestVersion.set() may trigger waiting actors) // forgetVersionsBeforeAsync visibly forgets immediately (without waiting) but asynchronously frees memory. 
- Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( newOldestVersion, TaskUpdateStorage ); + Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( newOldestVersion, TaskPriority::UpdateStorage ); data->oldestVersion.set( newOldestVersion ); wait( finishedForgetting ); - wait( yield(TaskUpdateStorage) ); + wait( yield(TaskPriority::UpdateStorage) ); if (done) break; } @@ -2916,7 +2916,7 @@ ACTOR Future updateStorage(StorageServer* data) { } durableInProgress.send(Void()); - wait( delay(0, TaskUpdateStorage) ); //Setting durableInProgess could cause the storage server to shut down, so delay to check for cancellation + wait( delay(0, TaskPriority::UpdateStorage) ); //Setting durableInProgess could cause the storage server to shut down, so delay to check for cancellation // Taking and releasing the durableVersionLock ensures that no eager reads both begin before the commit was effective and // are applied after we change the durable version. Also ensure that we have to lock while calling changeDurableVersion, @@ -2925,9 +2925,9 @@ ACTOR Future updateStorage(StorageServer* data) { data->popVersion( data->durableVersion.get() + 1 ); while (!changeDurableVersion( data, newOldestVersion )) { - if(g_network->check_yield(TaskUpdateStorage)) { + if(g_network->check_yield(TaskPriority::UpdateStorage)) { data->durableVersionLock.release(); - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); wait( data->durableVersionLock.take() ); } } @@ -3537,7 +3537,7 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac } } when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work if( req.debugID.present() ) 
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.recieved"); //.detail("TaskID", g_network->getCurrentTask()); @@ -3552,11 +3552,11 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac actors.add(self->readGuard(req, watchValueQ)); } when (GetKeyRequest req = waitNext(ssi.getKey.getFuture())) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work actors.add(self->readGuard(req , getKey)); } when (GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture()) ) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work actors.add(self->readGuard(req , getKeyValues)); } when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 7205da04c4..b8fc706128 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -75,7 +75,7 @@ ACTOR static Future extractClientInfo( Reference> d } } -Database openDBOnServer( Reference> const& db, int taskID, bool enableLocalityLoadBalance, bool lockAware ) { +Database openDBOnServer( Reference> const& db, TaskPriority taskID, bool enableLocalityLoadBalance, bool lockAware ) { Reference> info( new AsyncVar ); return DatabaseContext::create( info, extractClientInfo(db, info), enableLocalityLoadBalance ? 
db->get().myLocality : LocalityData(), enableLocalityLoadBalance, taskID, lockAware ); } @@ -737,7 +737,7 @@ ACTOR Future workerServer( } } else { bool lockAware = metricsPrefix.size() && metricsPrefix[0] == '\xff'; - metricsLogger = runMetrics( openDBOnServer( dbInfo, TaskDefaultEndpoint, true, lockAware ), KeyRef(metricsPrefix) ); + metricsLogger = runMetrics( openDBOnServer( dbInfo, TaskPriority::DefaultEndpoint, true, lockAware ), KeyRef(metricsPrefix) ); } } @@ -1169,7 +1169,7 @@ ACTOR Future workerServer( } when( wait( loggingTrigger ) ) { systemMonitor(); - loggingTrigger = delay( loggingDelay, TaskFlushTrace ); + loggingTrigger = delay( loggingDelay, TaskPriority::FlushTrace ); } when(state ExecuteRequest req = waitNext(interf.execReq.getFuture())) { state ExecCmdValueString execArg(req.execPayload); diff --git a/flow/IThreadPool.h b/flow/IThreadPool.h index 5da60d2930..c5be41f87a 100644 --- a/flow/IThreadPool.h +++ b/flow/IThreadPool.h @@ -92,12 +92,12 @@ public: void send( T const& t ) { // Can be called safely from another thread. Call send or sendError at most once. Promise signal; tagAndForward( &promise, t, signal.getFuture() ); - g_network->onMainThread( std::move(signal), g_network->getCurrentTask() | 1 ); + g_network->onMainThread( std::move(signal), incrementPriority( g_network->getCurrentTask() ) ); } void sendError( Error const& e ) { // Can be called safely from another thread. Call send or sendError at most once. 
Promise signal; tagAndForwardError( &promise, e, signal.getFuture() ); - g_network->onMainThread( std::move(signal), g_network->getCurrentTask() | 1 ); + g_network->onMainThread( std::move(signal), incrementPriority( g_network->getCurrentTask() ) ); } private: Promise promise; @@ -106,4 +106,4 @@ private: Reference createGenericThreadPool(); -#endif \ No newline at end of file +#endif diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 2dcf9783ed..0c3db011ed 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -100,9 +100,9 @@ public: struct OrderedTask { int64_t priority; - int taskID; + TaskPriority taskID; Task *task; - OrderedTask(int64_t priority, int taskID, Task* task) : priority(priority), taskID(taskID), task(task) {} + OrderedTask(int64_t priority, TaskPriority taskID, Task* task) : priority(priority), taskID(taskID), task(task) {} bool operator < (OrderedTask const& rhs) const { return priority < rhs.priority; } }; @@ -122,12 +122,12 @@ public: // INetwork interface virtual double now() { return currentTime; }; - virtual Future delay( double seconds, int taskId ); - virtual Future yield( int taskID ); - virtual bool check_yield(int taskId); - virtual int getCurrentTask() { return currentTaskID; } - virtual void setCurrentTask(int taskID ) { priorityMetric = currentTaskID = taskID; } - virtual void onMainThread( Promise&& signal, int taskID ); + virtual Future delay( double seconds, TaskPriority taskId ); + virtual Future yield( TaskPriority taskID ); + virtual bool check_yield(TaskPriority taskId); + virtual TaskPriority getCurrentTask() { return currentTaskID; } + virtual void setCurrentTask(TaskPriority taskID ) { currentTaskID = taskID; priorityMetric = (int64_t)taskID; } + virtual void onMainThread( Promise&& signal, TaskPriority taskID ); virtual void stop() { if ( thread_network == this ) stopImmediately(); @@ -157,7 +157,7 @@ public: int64_t tsc_begin, tsc_end; double taskBegin; - int currentTaskID; + TaskPriority currentTaskID; 
uint64_t tasksIssued; TDMetricCollection tdmetrics; double currentTime; @@ -167,7 +167,7 @@ public: uint64_t numYields; double lastPriorityTrackTime; - int lastMinTaskID; + TaskPriority lastMinTaskID; double priorityTimer[NetworkMetrics::PRIORITY_BINS]; std::priority_queue> ready; @@ -175,15 +175,15 @@ public: struct DelayedTask : OrderedTask { double at; - DelayedTask(double at, int64_t priority, int taskID, Task* task) : at(at), OrderedTask(priority, taskID, task) {} + DelayedTask(double at, int64_t priority, TaskPriority taskID, Task* task) : at(at), OrderedTask(priority, taskID, task) {} bool operator < (DelayedTask const& rhs) const { return at > rhs.at; } // Ordering is reversed for priority_queue }; std::priority_queue> timers; - void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, int64_t priority); - bool check_yield(int taskId, bool isRunLoop); + void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority); + bool check_yield(TaskPriority taskId, bool isRunLoop); void processThreadReady(); - void trackMinPriority( int minTaskID, double now ); + void trackMinPriority( TaskPriority minTaskID, double now ); void stopImmediately() { stopped=true; decltype(ready) _1; ready.swap(_1); decltype(timers) _2; timers.swap(_2); } @@ -489,8 +489,8 @@ Net2::Net2(bool useThreadPool, bool useMetrics, bool useObjectSerializer) stopped(false), tasksIssued(0), // Until run() is called, yield() will always yield - tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskDefaultYield), - lastMinTaskID(0), + tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield), + lastMinTaskID(TaskPriority::Zero), numYields(0) { TraceEvent("Net2Starting"); @@ -511,7 +511,7 @@ Net2::Net2(bool useThreadPool, bool useMetrics, bool useObjectSerializer) int priBins[] = { 1, 2050, 3050, 4050, 4950, 5050, 7050, 8050, 10050 }; static_assert( sizeof(priBins) == sizeof(int)*NetworkMetrics::PRIORITY_BINS, "Fix priority 
bins"); for(int i=0; i(priBins[i]); updateNow(); } @@ -579,7 +579,7 @@ void Net2::run() { tsc_begin = __rdtsc(); taskBegin = timer_monotonic(); runFunc(); - checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskRunCycleFunction); + checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskPriority::RunCycleFunction); } double sleepTime = 0; @@ -607,7 +607,7 @@ void Net2::run() { if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow); - if (sleepTime) trackMinPriority( 0, now ); + if (sleepTime) trackMinPriority( TaskPriority::Zero, now ); while (!timers.empty() && timers.top().at < now) { ++countTimers; ready.push( timers.top() ); @@ -620,12 +620,12 @@ void Net2::run() { tsc_end = tsc_begin + FLOW_KNOBS->TSC_YIELD_TIME; taskBegin = timer_monotonic(); numYields = 0; - int minTaskID = TaskMaxPriority; + TaskPriority minTaskID = TaskPriority::Max; while (!ready.empty()) { ++countTasks; currentTaskID = ready.top().taskID; - priorityMetric = currentTaskID; + priorityMetric = static_cast(currentTaskID); minTaskID = std::min(minTaskID, currentTaskID); Task* task = ready.top().task; ready.pop(); @@ -638,7 +638,7 @@ void Net2::run() { TraceEvent(SevError, "TaskError").error(unknown_error()); } - if (check_yield(TaskMaxPriority, true)) { ++countYields; break; } + if (check_yield(TaskPriority::Max, true)) { ++countYields; break; } } nnow = timer_monotonic(); @@ -697,10 +697,10 @@ void Net2::run() { #endif } -void Net2::trackMinPriority( int minTaskID, double now ) { +void Net2::trackMinPriority( TaskPriority minTaskID, double now ) { if (minTaskID != lastMinTaskID) for(int c=0; c= minTaskID && pri < lastMinTaskID) { // busy -> idle double busyFor = lastPriorityTrackTime - priorityTimer[c]; networkMetrics.secSquaredPriorityBlocked[c] += busyFor*busyFor; @@ -723,7 +723,7 @@ void 
Net2::processThreadReady() { } } -void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, int64_t priority) { +void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority) { int64_t elapsed = tscEnd-tscBegin; if (elapsed > FLOW_KNOBS->TSC_YIELD_TIME && tscBegin > 0) { int i = std::min(NetworkMetrics::SLOW_EVENT_BINS-1, log( elapsed/1e6 ) / log(2.)); @@ -734,7 +734,7 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, i slowTaskMetric->clocks = elapsed; slowTaskMetric->duration = (int64_t)(duration*1e9); - slowTaskMetric->priority = priority; + slowTaskMetric->priority = static_cast(priority); slowTaskMetric->numYields = numYields; slowTaskMetric->log(); @@ -748,7 +748,7 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, i } } -bool Net2::check_yield( int taskID, bool isRunLoop ) { +bool Net2::check_yield( TaskPriority taskID, bool isRunLoop ) { if(!isRunLoop && numYields > 0) { ++numYields; return true; @@ -761,8 +761,8 @@ bool Net2::check_yield( int taskID, bool isRunLoop ) { processThreadReady(); - if (taskID == TaskDefaultYield) taskID = currentTaskID; - if (!ready.empty() && ready.top().priority > (int64_t(taskID)<<32)) { + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; + if (!ready.empty() && ready.top().priority > int64_t(taskID)<<32) { return true; } @@ -787,13 +787,13 @@ bool Net2::check_yield( int taskID, bool isRunLoop ) { return false; } -bool Net2::check_yield( int taskID ) { +bool Net2::check_yield( TaskPriority taskID ) { return check_yield(taskID, false); } -Future Net2::yield( int taskID ) { +Future Net2::yield( TaskPriority taskID ) { ++countYieldCalls; - if (taskID == TaskDefaultYield) taskID = currentTaskID; + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (check_yield(taskID, false)) { ++countYieldCallsTrue; return delay(0, taskID); @@ -802,7 +802,7 @@ Future Net2::yield( int taskID 
) { return Void(); } -Future Net2::delay( double seconds, int taskId ) { +Future Net2::delay( double seconds, TaskPriority taskId ) { if (seconds <= 0.) { PromiseTask* t = new PromiseTask; this->ready.push( OrderedTask( (int64_t(taskId)<<32)-(++tasksIssued), taskId, t) ); @@ -817,7 +817,7 @@ Future Net2::delay( double seconds, int taskId ) { return t->promise.getFuture(); } -void Net2::onMainThread(Promise&& signal, int taskID) { +void Net2::onMainThread(Promise&& signal, TaskPriority taskID) { if (stopped) return; PromiseTask* p = new PromiseTask( std::move(signal) ); int64_t priority = int64_t(taskID)<<32; diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ef63f13c17..87befe9bb7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -248,7 +248,7 @@ struct Profiler { outOffset += self->environmentInfoWriter.getLength(); loop { - wait( self->network->delay(1.0, TaskMinPriority) || self->network->delay(2.0, TaskMaxPriority) ); + wait( self->network->delay(1.0, TaskPriority::Min) || self->network->delay(2.0, TaskPriority::Max) ); self->enableSignal(false); std::swap( self->output_buffer, otherBuffer ); diff --git a/flow/ThreadHelper.actor.h b/flow/ThreadHelper.actor.h index 4fdd3c26ff..ed6a9cdc7d 100644 --- a/flow/ThreadHelper.actor.h +++ b/flow/ThreadHelper.actor.h @@ -35,11 +35,11 @@ // void onMainThreadVoid( F f ) { // Promise signal; // doOnMainThreadVoid( signal.getFuture(), f ); -// g_network->onMainThread( std::move(signal), TaskDefaultOnMainThread ); +// g_network->onMainThread( std::move(signal), TaskPriority::DefaultOnMainThread ); // } template -void onMainThreadVoid( F f, Error* err, int taskID = TaskDefaultOnMainThread ) { +void onMainThreadVoid( F f, Error* err, TaskPriority taskID = TaskPriority::DefaultOnMainThread ) { Promise signal; doOnMainThreadVoid( signal.getFuture(), f, err ); g_network->onMainThread( std::move(signal), taskID ); @@ -585,7 +585,7 @@ template ThreadFuture< decltype(fake()().getValue()) > 
onMainThread returnValue->addref(); // For the ThreadFuture we return Future cancelFuture = doOnMainThread()().getValue()), F>( signal.getFuture(), f, returnValue ); returnValue->setCancel( std::move(cancelFuture) ); - g_network->onMainThread( std::move(signal), TaskDefaultOnMainThread ); + g_network->onMainThread( std::move(signal), TaskPriority::DefaultOnMainThread ); return ThreadFuture()().getValue())>( returnValue ); } diff --git a/flow/Trace.cpp b/flow/Trace.cpp index 45fcce8d2e..4e70a5d29b 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -630,7 +630,7 @@ void openTraceFile(const NetworkAddress& na, uint64_t rollsize, uint64_t maxLogs std::string baseName = format("%s.%s.%d", baseOfBase.c_str(), ip.c_str(), na.port); g_traceLog.open( directory, baseName, logGroup, format("%lld", time(NULL)), rollsize, maxLogsSize, !g_network->isSimulated() ? na : Optional()); - uncancellable(recurring(&flushTraceFile, FLOW_KNOBS->TRACE_FLUSH_INTERVAL, TaskFlushTrace)); + uncancellable(recurring(&flushTraceFile, FLOW_KNOBS->TRACE_FLUSH_INTERVAL, TaskPriority::FlushTrace)); g_traceBatch.dump(); } diff --git a/flow/flow.h b/flow/flow.h index 7ce23eade7..53f35516eb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -817,7 +817,7 @@ public: return getReplyPromise(value).getFuture(); } template - Future getReply(const X& value, int taskID) const { + Future getReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); return getReplyPromise(value).getFuture(); } @@ -827,7 +827,7 @@ public: return getReply(Promise()); } template - Future getReplyWithTaskID(int taskID) const { + Future getReplyWithTaskID(TaskPriority taskID) const { Promise reply; reply.getEndpoint(taskID); return getReply(reply); @@ -908,11 +908,11 @@ struct ActorSingleCallback : SingleCallback { } }; inline double now() { return g_network->now(); } -inline Future delay(double seconds, int taskID = TaskDefaultDelay) { return g_network->delay(seconds, taskID); } -inline Future delayUntil(double 
time, int taskID = TaskDefaultDelay) { return g_network->delay(std::max(0.0, time - g_network->now()), taskID); } -inline Future delayJittered(double seconds, int taskID = TaskDefaultDelay) { return g_network->delay(seconds*(FLOW_KNOBS->DELAY_JITTER_OFFSET + FLOW_KNOBS->DELAY_JITTER_RANGE*deterministicRandom()->random01()), taskID); } -inline Future yield(int taskID = TaskDefaultYield) { return g_network->yield(taskID); } -inline bool check_yield(int taskID = TaskDefaultYield) { return g_network->check_yield(taskID); } +inline Future delay(double seconds, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(seconds, taskID); } +inline Future delayUntil(double time, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(std::max(0.0, time - g_network->now()), taskID); } +inline Future delayJittered(double seconds, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(seconds*(FLOW_KNOBS->DELAY_JITTER_OFFSET + FLOW_KNOBS->DELAY_JITTER_RANGE*deterministicRandom()->random01()), taskID); } +inline Future yield(TaskPriority taskID = TaskPriority::DefaultYield) { return g_network->yield(taskID); } +inline bool check_yield(TaskPriority taskID = TaskPriority::DefaultYield) { return g_network->check_yield(taskID); } #include "flow/genericactors.actor.h" #endif diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7b577b2e4c..fdf02a30d2 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -183,7 +183,7 @@ Future waitForAllReady( std::vector> results ) { } ACTOR template -Future timeout( Future what, double time, T timedoutValue, int taskID = TaskDefaultDelay ) { +Future timeout( Future what, double time, T timedoutValue, TaskPriority taskID = TaskPriority::DefaultDelay ) { Future end = delay( time, taskID ); choose { when( T t = wait( what ) ) { return t; } @@ -201,7 +201,7 @@ Future> timeout( Future what, double time ) { } ACTOR template -Future timeoutError( Future 
what, double time, int taskID = TaskDefaultDelay ) { +Future timeoutError( Future what, double time, TaskPriority taskID = TaskPriority::DefaultDelay ) { Future end = delay( time, taskID ); choose { when( T t = wait( what ) ) { return t; } @@ -210,7 +210,7 @@ Future timeoutError( Future what, double time, int taskID = TaskDefaultDel } ACTOR template -Future delayed( Future what, double time = 0.0, int taskID = TaskDefaultDelay ) { +Future delayed( Future what, double time = 0.0, TaskPriority taskID = TaskPriority::DefaultDelay ) { try { state T t = wait( what ); wait( delay( time, taskID ) ); @@ -223,7 +223,7 @@ Future delayed( Future what, double time = 0.0, int taskID = TaskDefaultDe } ACTOR template -Future recurring( Func what, double interval, int taskID = TaskDefaultDelay ) { +Future recurring( Func what, double interval, TaskPriority taskID = TaskPriority::DefaultDelay ) { loop choose { when ( wait( delay( interval, taskID ) ) ) { what(); } } @@ -951,7 +951,7 @@ Future quorum(std::vector> const& results, int n) { } ACTOR template -Future smartQuorum( std::vector> results, int required, double extraSeconds, int taskID = TaskDefaultDelay ) { +Future smartQuorum( std::vector> results, int required, double extraSeconds, TaskPriority taskID = TaskPriority::DefaultDelay ) { if (results.empty() && required == 0) return Void(); wait(quorum(results, required)); choose { @@ -1259,7 +1259,7 @@ struct FlowLock : NonCopyable, public ReferenceCounted { FlowLock() : permits(1), active(0) {} explicit FlowLock(int64_t permits) : permits(permits), active(0) {} - Future take(int taskID = TaskDefaultYield, int64_t amount = 1) { + Future take(TaskPriority taskID = TaskPriority::DefaultYield, int64_t amount = 1) { if (active + amount <= permits || active == 0) { active += amount; return safeYieldActor(this, taskID, amount); @@ -1298,7 +1298,7 @@ private: int64_t active; Promise broken_on_destruct; - ACTOR static Future takeActor(FlowLock* lock, int taskID, int64_t amount) { + 
ACTOR static Future takeActor(FlowLock* lock, TaskPriority taskID, int64_t amount) { state std::list, int64_t>>::iterator it = lock->takers.insert(lock->takers.end(), std::make_pair(Promise(), amount)); try { @@ -1330,7 +1330,7 @@ private: return Void(); } - ACTOR static Future safeYieldActor(FlowLock* lock, int taskID, int64_t amount) { + ACTOR static Future safeYieldActor(FlowLock* lock, TaskPriority taskID, int64_t amount) { try { choose{ when(wait(yield(taskID))) {} @@ -1351,7 +1351,7 @@ private: }; ACTOR template -Future yieldPromiseStream( FutureStream input, PromiseStream output, int taskID = TaskDefaultYield ) { +Future yieldPromiseStream( FutureStream input, PromiseStream output, TaskPriority taskID = TaskPriority::DefaultYield ) { loop { T f = waitNext( input ); output.send( f ); diff --git a/flow/network.h b/flow/network.h index 256bc89b40..bb4841a97d 100644 --- a/flow/network.h +++ b/flow/network.h @@ -31,55 +31,64 @@ #include "flow/IRandom.h" #include "fdbrpc/crc32c.h" -enum { - TaskMaxPriority = 1000000, - TaskRunCycleFunction = 20000, - TaskFlushTrace = 10500, - TaskWriteSocket = 10000, - TaskPollEIO = 9900, - TaskDiskIOComplete = 9150, - TaskLoadBalancedEndpoint = 9000, - TaskReadSocket = 9000, - TaskCoordinationReply = 8810, - TaskCoordination = 8800, - TaskFailureMonitor = 8700, - TaskResolutionMetrics = 8700, - TaskClusterController = 8650, - TaskProxyStorageRejoin = 8645, - TaskProxyCommitDispatcher = 8640, - TaskTLogQueuingMetrics = 8620, - TaskTLogPop = 8610, - TaskTLogPeekReply = 8600, - TaskTLogPeek = 8590, - TaskTLogCommitReply = 8580, - TaskTLogCommit = 8570, - TaskProxyGetRawCommittedVersion = 8565, - TaskProxyResolverReply = 8560, - TaskProxyCommitBatcher = 8550, - TaskProxyCommit = 8540, - TaskTLogConfirmRunningReply = 8530, - TaskTLogConfirmRunning = 8520, - TaskProxyGRVTimer = 8510, - TaskProxyGetConsistentReadVersion = 8500, - TaskDefaultPromiseEndpoint = 8000, - TaskDefaultOnMainThread = 7500, - TaskDefaultDelay = 7010, - 
TaskDefaultYield = 7000, - TaskDiskRead = 5010, - TaskDefaultEndpoint = 5000, - TaskUnknownEndpoint = 4000, - TaskMoveKeys = 3550, - TaskDataDistributionLaunch = 3530, - TaskRatekeeper = 3510, - TaskDataDistribution = 3500, - TaskDiskWrite = 3010, - TaskUpdateStorage = 3000, - TaskTLogSpilledPeekReply = 2800, - TaskLowPriority = 2000, +enum class TaskPriority { + Max = 1000000, + RunCycleFunction = 20000, + FlushTrace = 10500, + WriteSocket = 10000, + PollEIO = 9900, + DiskIOComplete = 9150, + LoadBalancedEndpoint = 9000, + ReadSocket = 9000, + CoordinationReply = 8810, + Coordination = 8800, + FailureMonitor = 8700, + ResolutionMetrics = 8700, + ClusterController = 8650, + ProxyStorageRejoin = 8645, + ProxyCommitDispatcher = 8640, + TLogQueuingMetrics = 8620, + TLogPop = 8610, + TLogPeekReply = 8600, + TLogPeek = 8590, + TLogCommitReply = 8580, + TLogCommit = 8570, + ProxyGetRawCommittedVersion = 8565, + ProxyResolverReply = 8560, + ProxyCommitBatcher = 8550, + ProxyCommit = 8540, + TLogConfirmRunningReply = 8530, + TLogConfirmRunning = 8520, + ProxyGRVTimer = 8510, + ProxyGetConsistentReadVersion = 8500, + DefaultPromiseEndpoint = 8000, + DefaultOnMainThread = 7500, + DefaultDelay = 7010, + DefaultYield = 7000, + DiskRead = 5010, + DefaultEndpoint = 5000, + UnknownEndpoint = 4000, + MoveKeys = 3550, + DataDistributionLaunch = 3530, + Ratekeeper = 3510, + DataDistribution = 3500, + DiskWrite = 3010, + UpdateStorage = 3000, + TLogSpilledPeekReply = 2800, + Low = 2000, - TaskMinPriority = 1000 + Min = 1000, + Zero = 0 }; +inline TaskPriority incrementPriority(TaskPriority p) { + return static_cast( static_cast(p) + 1 ); +} + +inline TaskPriority decrementPriority(TaskPriority p) { + return static_cast( static_cast(p) + 1 ); +} + class Void; template class Optional; @@ -270,7 +279,7 @@ struct NetworkMetrics { uint64_t countSlowEvents[SLOW_EVENT_BINS]; enum { PRIORITY_BINS = 9 }; - int priorityBins[ PRIORITY_BINS ]; + TaskPriority priorityBins[ PRIORITY_BINS ]; double 
secSquaredPriorityBlocked[PRIORITY_BINS]; double oldestAlternativesFailure; @@ -372,19 +381,19 @@ public: // Provides a clock that advances at a similar rate on all connected endpoints // FIXME: Return a fixed point Time class - virtual Future delay( double seconds, int taskID ) = 0; + virtual Future delay( double seconds, TaskPriority taskID ) = 0; // The given future will be set after seconds have elapsed - virtual Future yield( int taskID ) = 0; + virtual Future yield( TaskPriority taskID ) = 0; // The given future will be set immediately or after higher-priority tasks have executed - virtual bool check_yield( int taskID ) = 0; + virtual bool check_yield( TaskPriority taskID ) = 0; // Returns true if a call to yield would result in a delay - virtual int getCurrentTask() = 0; + virtual TaskPriority getCurrentTask() = 0; // Gets the taskID/priority of the current task - virtual void setCurrentTask(int taskID ) = 0; + virtual void setCurrentTask(TaskPriority taskID ) = 0; // Sets the taskID/priority of the current task, without yielding virtual flowGlobalType global(int id) = 0; @@ -396,7 +405,7 @@ public: virtual bool isSimulated() const = 0; // Returns true if this network is a local simulation - virtual void onMainThread( Promise&& signal, int taskID ) = 0; + virtual void onMainThread( Promise&& signal, TaskPriority taskID ) = 0; // Executes signal.send(Void()) on a/the thread belonging to this network virtual THREAD_HANDLE startThread( THREAD_FUNC_RETURN (*func) (void *), void *arg) = 0; From 8e28930d12b1a94a2c62b827b1ba43b5d31bb013 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 10:36:32 -0700 Subject: [PATCH 007/136] Fix another hardcoded priority. 
--- fdbclient/VersionedMap.actor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/VersionedMap.actor.h b/fdbclient/VersionedMap.actor.h index 953c2f4c1f..53ba85097f 100644 --- a/fdbclient/VersionedMap.actor.h +++ b/fdbclient/VersionedMap.actor.h @@ -31,7 +31,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. ACTOR template -Future deferredCleanupActor( std::vector toFree, TaskPriority taskID = 7000 ) { +Future deferredCleanupActor( std::vector toFree, TaskPriority taskID = TaskPriority::DefaultYield ) { state int freeCount = 0; while (!toFree.empty()) { Tree a = std::move( toFree.back() ); From d7c00f9cd29d50d099c5ebd25346a1cc8715e339 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 14:19:56 -0700 Subject: [PATCH 008/136] And another. --- fdbclient/VersionedMap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/VersionedMap.h b/fdbclient/VersionedMap.h index 58c440c679..f56b883892 100644 --- a/fdbclient/VersionedMap.h +++ b/fdbclient/VersionedMap.h @@ -511,7 +511,7 @@ public: oldestVersion = newOldestVersion; } - Future forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = 7000 ) { + Future forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = TaskPriority::DefaultYield ) { ASSERT( newOldestVersion <= latestVersion ); roots[newOldestVersion] = getRoot(newOldestVersion); From b5af601a8a618b779635c12d3c5995757cc52787 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 25 Jun 2019 21:41:43 -0700 Subject: [PATCH 009/136] Fix ExternalWorkload not being a part of the old build/test system. 
--- fdbserver/fdbserver.vcxproj | 1 + fdbserver/workloads/ExternalWorkload.actor.cpp | 2 +- tests/CMakeLists.txt | 2 +- tests/{fast => }/SimpleExternalTest.txt | 0 4 files changed, 3 insertions(+), 2 deletions(-) rename tests/{fast => }/SimpleExternalTest.txt (100%) diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 44e752cdb3..2dd4cb17db 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -157,6 +157,7 @@ + diff --git a/fdbserver/workloads/ExternalWorkload.actor.cpp b/fdbserver/workloads/ExternalWorkload.actor.cpp index c967bc1655..69715def42 100644 --- a/fdbserver/workloads/ExternalWorkload.actor.cpp +++ b/fdbserver/workloads/ExternalWorkload.actor.cpp @@ -21,7 +21,7 @@ #include "flow/ThreadHelper.actor.h" #include "flow/Platform.h" #include "fdbclient/ThreadSafeTransaction.h" -#include "foundationdb/ClientWorkload.h" +#include "bindings/c/foundationdb/ClientWorkload.h" #include "fdbserver/workloads/workloads.actor.h" #include "flow/actorcompiler.h" // has to be last include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 80ef93456a..c90fc36ceb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -63,6 +63,7 @@ add_fdb_test(TEST_FILES ReadAbsent.txt IGNORE) add_fdb_test(TEST_FILES ReadHalfAbsent.txt IGNORE) add_fdb_test(TEST_FILES RedwoodCorrectness.txt IGNORE) add_fdb_test(TEST_FILES RedwoodPerfTests.txt IGNORE) +add_fdb_test(TEST_FILES SimpleExternalTest.txt) add_fdb_test(TEST_FILES SlowTask.txt IGNORE) add_fdb_test(TEST_FILES SpecificUnitTest.txt IGNORE) add_fdb_test(TEST_FILES StreamingWrite.txt IGNORE) @@ -109,7 +110,6 @@ add_fdb_test(TEST_FILES fast/RandomUnitTests.txt) add_fdb_test(TEST_FILES fast/SelectorCorrectness.txt) add_fdb_test(TEST_FILES fast/Sideband.txt) add_fdb_test(TEST_FILES fast/SidebandWithStatus.txt) -add_fdb_test(TEST_FILES fast/SimpleExternalTest.txt) add_fdb_test(TEST_FILES fast/SnapTestFailAndDisablePop.txt) add_fdb_test(TEST_FILES 
fast/SwizzledRollbackSideband.txt) add_fdb_test(TEST_FILES fast/SystemRebootTestCycle.txt) diff --git a/tests/fast/SimpleExternalTest.txt b/tests/SimpleExternalTest.txt similarity index 100% rename from tests/fast/SimpleExternalTest.txt rename to tests/SimpleExternalTest.txt From 7f2381484147057ef2cf8618c7fd944183610150 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 26 Jun 2019 14:03:02 -0700 Subject: [PATCH 010/136] Track run loop busyness and report it in status. --- .../source/mr-status-json-schemas.rst.inc | 3 +- documentation/sphinx/source/release-notes.rst | 2 + fdbclient/Schemas.cpp | 3 +- fdbserver/Status.actor.cpp | 97 +++++++++++-------- flow/Net2.actor.cpp | 34 ++++--- flow/SystemMonitor.cpp | 29 ++++-- flow/network.h | 3 + 7 files changed, 109 insertions(+), 62 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 5b0099f142..ad5d6d95b5 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -187,7 +187,8 @@ "megabits_received":{ "hz":0.0 } - } + }, + "run_loop_busy":0.2 // fraction of time the run loop was busy } }, "old_logs":[ diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index f2a9813030..05654f5d14 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -17,6 +17,8 @@ Fixes Status ------ +* Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #) `_. 
+ Bindings -------- diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 2e3db10c40..f1f2c5e305 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -207,7 +207,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "megabits_received":{ "hz":0.0 } - } + }, + "run_loop_busy":0.2 } }, "old_logs":[ diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 47c61aeb9f..2a8e8cf27c 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -315,10 +315,10 @@ static JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics, vector 0){ - cpuObj["logical_core_utilization"] = std::max(0.0, std::min(cpu_seconds / elapsed, 1.0)); + cpuObj["logical_core_utilization"] = std::max(0.0, std::min(cpuSeconds / elapsed, 1.0)); } statusObj["cpu"] = cpuObj; @@ -541,8 +541,8 @@ struct RolesInfo { ACTOR static Future processStatusFetcher( Reference> db, std::vector workers, WorkerEvents pMetrics, - WorkerEvents mMetrics, WorkerEvents errors, WorkerEvents traceFileOpenErrors, WorkerEvents programStarts, - std::map> processIssues, + WorkerEvents mMetrics, WorkerEvents nMetrics, WorkerEvents errors, WorkerEvents traceFileOpenErrors, + WorkerEvents programStarts, std::map> processIssues, vector> storageServers, vector> tLogs, vector> proxies, Database cx, Optional configuration, Optional healthyZone, std::set* incomplete_reasons) { @@ -668,84 +668,84 @@ ACTOR static Future processStatusFetcher( ASSERT(pMetrics.count(workerItr->interf.address())); NetworkAddress address = workerItr->interf.address(); - const TraceEventFields& event = pMetrics[workerItr->interf.address()]; + const TraceEventFields& processMetrics = pMetrics[workerItr->interf.address()]; statusObj["address"] = address.toString(); JsonBuilderObject memoryObj; - if (event.size() > 0) { - std::string zoneID = event.getValue("ZoneID"); + if (processMetrics.size() > 0) { + std::string zoneID = processMetrics.getValue("ZoneID"); 
statusObj["fault_domain"] = zoneID; if(healthyZone.present() && healthyZone == workerItr->interf.locality.zoneId()) { statusObj["under_maintenance"] = true; } - std::string MachineID = event.getValue("MachineID"); + std::string MachineID = processMetrics.getValue("MachineID"); statusObj["machine_id"] = MachineID; statusObj["locality"] = getLocalityInfo(workerItr->interf.locality); - statusObj.setKeyRawNumber("uptime_seconds",event.getValue("UptimeSeconds")); + statusObj.setKeyRawNumber("uptime_seconds", processMetrics.getValue("UptimeSeconds")); // rates are calculated over the last elapsed seconds - double elapsed = event.getDouble("Elapsed"); - double cpu_seconds = event.getDouble("CPUSeconds"); - double diskIdleSeconds = event.getDouble("DiskIdleSeconds"); - double diskReads = event.getDouble("DiskReads"); - double diskWrites = event.getDouble("DiskWrites"); + double processMetricsElapsed = processMetrics.getDouble("Elapsed"); + double cpuSeconds = processMetrics.getDouble("CPUSeconds"); + double diskIdleSeconds = processMetrics.getDouble("DiskIdleSeconds"); + double diskReads = processMetrics.getDouble("DiskReads"); + double diskWrites = processMetrics.getDouble("DiskWrites"); JsonBuilderObject diskObj; - if (elapsed > 0){ + if (processMetricsElapsed > 0) { JsonBuilderObject cpuObj; - cpuObj["usage_cores"] = std::max(0.0, cpu_seconds / elapsed); + cpuObj["usage_cores"] = std::max(0.0, cpuSeconds / processMetricsElapsed); statusObj["cpu"] = cpuObj; - diskObj["busy"] = std::max(0.0, std::min((elapsed - diskIdleSeconds) / elapsed, 1.0)); + diskObj["busy"] = std::max(0.0, std::min((processMetricsElapsed - diskIdleSeconds) / processMetricsElapsed, 1.0)); JsonBuilderObject readsObj; - readsObj.setKeyRawNumber("counter",event.getValue("DiskReadsCount")); - if (elapsed > 0) - readsObj["hz"] = diskReads / elapsed; - readsObj.setKeyRawNumber("sectors",event.getValue("DiskReadSectors")); + readsObj.setKeyRawNumber("counter", processMetrics.getValue("DiskReadsCount")); + 
if (processMetricsElapsed > 0) + readsObj["hz"] = diskReads / processMetricsElapsed; + readsObj.setKeyRawNumber("sectors", processMetrics.getValue("DiskReadSectors")); JsonBuilderObject writesObj; - writesObj.setKeyRawNumber("counter",event.getValue("DiskWritesCount")); - if (elapsed > 0) - writesObj["hz"] = diskWrites / elapsed; - writesObj.setKeyRawNumber("sectors",event.getValue("DiskWriteSectors")); + writesObj.setKeyRawNumber("counter", processMetrics.getValue("DiskWritesCount")); + if (processMetricsElapsed > 0) + writesObj["hz"] = diskWrites / processMetricsElapsed; + writesObj.setKeyRawNumber("sectors", processMetrics.getValue("DiskWriteSectors")); diskObj["reads"] = readsObj; diskObj["writes"] = writesObj; } - diskObj.setKeyRawNumber("total_bytes",event.getValue("DiskTotalBytes")); - diskObj.setKeyRawNumber("free_bytes",event.getValue("DiskFreeBytes")); + diskObj.setKeyRawNumber("total_bytes", processMetrics.getValue("DiskTotalBytes")); + diskObj.setKeyRawNumber("free_bytes", processMetrics.getValue("DiskFreeBytes")); statusObj["disk"] = diskObj; JsonBuilderObject networkObj; - networkObj.setKeyRawNumber("current_connections",event.getValue("CurrentConnections")); + networkObj.setKeyRawNumber("current_connections", processMetrics.getValue("CurrentConnections")); JsonBuilderObject connections_established; - connections_established.setKeyRawNumber("hz",event.getValue("ConnectionsEstablished")); + connections_established.setKeyRawNumber("hz", processMetrics.getValue("ConnectionsEstablished")); networkObj["connections_established"] = connections_established; JsonBuilderObject connections_closed; - connections_closed.setKeyRawNumber("hz",event.getValue("ConnectionsClosed")); + connections_closed.setKeyRawNumber("hz", processMetrics.getValue("ConnectionsClosed")); networkObj["connections_closed"] = connections_closed; JsonBuilderObject connection_errors; - connection_errors.setKeyRawNumber("hz",event.getValue("ConnectionErrors")); + 
connection_errors.setKeyRawNumber("hz", processMetrics.getValue("ConnectionErrors")); networkObj["connection_errors"] = connection_errors; JsonBuilderObject megabits_sent; - megabits_sent.setKeyRawNumber("hz",event.getValue("MbpsSent")); + megabits_sent.setKeyRawNumber("hz", processMetrics.getValue("MbpsSent")); networkObj["megabits_sent"] = megabits_sent; JsonBuilderObject megabits_received; - megabits_received.setKeyRawNumber("hz",event.getValue("MbpsReceived")); + megabits_received.setKeyRawNumber("hz", processMetrics.getValue("MbpsReceived")); networkObj["megabits_received"] = megabits_received; statusObj["network"] = networkObj; - memoryObj.setKeyRawNumber("used_bytes",event.getValue("Memory")); - memoryObj.setKeyRawNumber("unused_allocated_memory",event.getValue("UnusedAllocatedMemory")); + memoryObj.setKeyRawNumber("used_bytes", processMetrics.getValue("Memory")); + memoryObj.setKeyRawNumber("unused_allocated_memory", processMetrics.getValue("UnusedAllocatedMemory")); } if (programStarts.count(address)) { @@ -820,6 +820,19 @@ ACTOR static Future processStatusFetcher( if(workerItr->degraded) { statusObj["degraded"] = true; } + + const TraceEventFields& networkMetrics = nMetrics[workerItr->interf.address()]; + double networkMetricsElapsed = networkMetrics.getDouble("Elapsed"); + + try { + double runLoopBusy = networkMetrics.getDouble("PriorityBusy1"); + statusObj["run_loop_busy"] = runLoopBusy / networkMetricsElapsed; + } + catch(Error &e) { + // This should only happen very early in the process lifetime before priority bin info has been populated + incomplete_reasons->insert("Cannot retrieve run loop busyness."); + } + } catch (Error& e){ // Something strange occurred, process list is incomplete but what was built so far, if anything, will be returned. 
@@ -1905,6 +1918,7 @@ ACTOR Future clusterGetStatus( std::vector< Future< Optional >> > > futures; futures.push_back(latestEventOnWorkers(workers, "MachineMetrics")); futures.push_back(latestEventOnWorkers(workers, "ProcessMetrics")); + futures.push_back(latestEventOnWorkers(workers, "NetworkMetrics")); futures.push_back(latestErrorOnWorkers(workers)); futures.push_back(latestEventOnWorkers(workers, "TraceFileOpenError")); futures.push_back(latestEventOnWorkers(workers, "ProgramStart")); @@ -1944,9 +1958,10 @@ ACTOR Future clusterGetStatus( state WorkerEvents mMetrics = workerEventsVec[0].present() ? workerEventsVec[0].get().first : WorkerEvents(); // process metrics state WorkerEvents pMetrics = workerEventsVec[1].present() ? workerEventsVec[1].get().first : WorkerEvents(); - state WorkerEvents latestError = workerEventsVec[2].present() ? workerEventsVec[2].get().first : WorkerEvents(); - state WorkerEvents traceFileOpenErrors = workerEventsVec[3].present() ? workerEventsVec[3].get().first : WorkerEvents(); - state WorkerEvents programStarts = workerEventsVec[4].present() ? workerEventsVec[4].get().first : WorkerEvents(); + state WorkerEvents networkMetrics = workerEventsVec[2].present() ? workerEventsVec[2].get().first : WorkerEvents(); + state WorkerEvents latestError = workerEventsVec[3].present() ? workerEventsVec[3].get().first : WorkerEvents(); + state WorkerEvents traceFileOpenErrors = workerEventsVec[4].present() ? workerEventsVec[4].get().first : WorkerEvents(); + state WorkerEvents programStarts = workerEventsVec[5].present() ? 
workerEventsVec[5].get().first : WorkerEvents(); state JsonBuilderObject statusObj; if(db->get().recoveryCount > 0) { @@ -2089,7 +2104,11 @@ ACTOR Future clusterGetStatus( statusObj["layers"] = layers; } - JsonBuilderObject processStatus = wait(processStatusFetcher(db, workers, pMetrics, mMetrics, latestError, traceFileOpenErrors, programStarts, processIssues, storageServers, tLogs, proxies, cx, configuration, loadResult.present() ? loadResult.get().healthyZone : Optional(), &status_incomplete_reasons)); + JsonBuilderObject processStatus = wait(processStatusFetcher(db, workers, pMetrics, mMetrics, networkMetrics, + latestError, traceFileOpenErrors, programStarts, + processIssues, storageServers, tLogs, proxies, cx, + configuration, loadResult.present() ? loadResult.get().healthyZone : Optional(), + &status_incomplete_reasons)); statusObj["processes"] = processStatus; statusObj["clients"] = clientStatusFetcher(clientVersionMap, clientStatusInfoMap); diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 2dcf9783ed..ef6426936d 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -168,7 +168,6 @@ public: double lastPriorityTrackTime; int lastMinTaskID; - double priorityTimer[NetworkMetrics::PRIORITY_BINS]; std::priority_queue> ready; ThreadSafeQueue threadReady; @@ -577,7 +576,8 @@ void Net2::run() { if (runFunc) { tsc_begin = __rdtsc(); - taskBegin = timer_monotonic(); + taskBegin = nnow; + trackMinPriority(TaskRunCycleFunction, taskBegin); runFunc(); checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskRunCycleFunction); } @@ -591,8 +591,11 @@ void Net2::run() { ++countWontSleep; if (b) { sleepTime = 1e99; - if (!timers.empty()) - sleepTime = timers.top().at - timer_monotonic(); // + 500e-6? + double sleepStart = timer_monotonic(); + if (!timers.empty()) { + sleepTime = timers.top().at - sleepStart; // + 500e-6? 
+ } + trackMinPriority(0, sleepStart); } awakeMetric = false; @@ -607,7 +610,6 @@ void Net2::run() { if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow); - if (sleepTime) trackMinPriority( 0, now ); while (!timers.empty() && timers.top().at < now) { ++countTimers; ready.push( timers.top() ); @@ -641,7 +643,7 @@ void Net2::run() { if (check_yield(TaskMaxPriority, true)) { ++countYields; break; } } - nnow = timer_monotonic(); + trackMinPriority(minTaskID, now); #if defined(__linux__) if(FLOW_KNOBS->SLOWTASK_PROFILING_INTERVAL > 0) { @@ -685,11 +687,10 @@ void Net2::run() { net2liveness.fetch_add(1); } #endif + nnow = timer_monotonic(); if ((nnow-now) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (nnow-now)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopBottom").detail("Elapsed", nnow - now); // This includes the time spent running tasks - - trackMinPriority( minTaskID, nnow ); } #ifdef WIN32 @@ -698,17 +699,22 @@ void Net2::run() { } void Net2::trackMinPriority( int minTaskID, double now ) { - if (minTaskID != lastMinTaskID) + if (minTaskID != lastMinTaskID) { for(int c=0; c= minTaskID && pri < lastMinTaskID) { // busy -> idle - double busyFor = lastPriorityTrackTime - priorityTimer[c]; - networkMetrics.secSquaredPriorityBlocked[c] += busyFor*busyFor; + if (pri > minTaskID && pri <= lastMinTaskID) { // busy -> idle + double busyFor = lastPriorityTrackTime - networkMetrics.priorityTimer[c]; + networkMetrics.priorityBlocked[c] = false; + networkMetrics.priorityBlockedDuration[c] += busyFor; + networkMetrics.secSquaredPriorityBlocked[c] += busyFor * busyFor; } - if (pri < minTaskID && pri >= lastMinTaskID) { // idle -> busy - priorityTimer[c] = now; + if (pri <= minTaskID && pri > lastMinTaskID) { // idle -> busy + networkMetrics.priorityBlocked[c] = true; + 
networkMetrics.priorityTimer[c] = now; } } + } + lastMinTaskID = minTaskID; lastPriorityTrackTime = now; } diff --git a/flow/SystemMonitor.cpp b/flow/SystemMonitor.cpp index fc778717e9..4481cba0f0 100644 --- a/flow/SystemMonitor.cpp +++ b/flow/SystemMonitor.cpp @@ -59,8 +59,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta netData.init(); if (!DEBUG_DETERMINISM && currentStats.initialized) { { - TraceEvent e(eventName.c_str()); - e + TraceEvent(eventName.c_str()) .detail("Elapsed", currentStats.elapsed) .detail("CPUSeconds", currentStats.processCPUSeconds) .detail("MainThreadCPUSeconds", currentStats.mainThreadCPUSeconds) @@ -120,6 +119,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta TraceEvent n("NetworkMetrics"); n + .detail("Elapsed", currentStats.elapsed) .detail("CantSleep", netData.countCantSleep - statState->networkState.countCantSleep) .detail("WontSleep", netData.countWontSleep - statState->networkState.countWontSleep) .detail("Yields", netData.countYields - statState->networkState.countYields) @@ -139,12 +139,27 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta .detail("PacketsGenerated", netData.countPacketsGenerated - statState->networkState.countPacketsGenerated) .detail("WouldBlock", netData.countWouldBlock - statState->networkState.countWouldBlock); - for (int i = 0; inetworkMetrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) + for (int i = 0; inetworkMetrics.countSlowEvents[i] - statState->networkMetricsState.countSlowEvents[i]) { n.detail(format("SlowTask%dM", 1 << i).c_str(), c); - for (int i = 0; inetworkMetrics.secSquaredPriorityBlocked[i] - statState->networkMetricsState.secSquaredPriorityBlocked[i]) - n.detail(format("S2Pri%d", g_network->networkMetrics.priorityBins[i]).c_str(), x); + } + } + + for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkMetrics.priorityBins[i] != 0; i++) { + 
if(g_network->networkMetrics.priorityBlocked[i]) { + double lastSegment = std::min(currentStats.elapsed, now() - g_network->networkMetrics.priorityTimer[i]); + g_network->networkMetrics.priorityBlockedDuration[i] += lastSegment; + g_network->networkMetrics.secSquaredPriorityBlocked[i] += lastSegment * lastSegment; + g_network->networkMetrics.priorityTimer[i] = now(); + } + + double blocked = g_network->networkMetrics.priorityBlockedDuration[i] - statState->networkMetricsState.priorityBlockedDuration[i]; + double s2Blocked = g_network->networkMetrics.secSquaredPriorityBlocked[i] - statState->networkMetricsState.secSquaredPriorityBlocked[i]; + n.detail(format("PriorityBusy%d", g_network->networkMetrics.priorityBins[i]).c_str(), blocked); + n.detail(format("SumOfSquaredPriorityBusy%d", g_network->networkMetrics.priorityBins[i]).c_str(), s2Blocked); + } + + n.trackLatest("NetworkMetrics"); } if(machineMetrics) { diff --git a/flow/network.h b/flow/network.h index 256bc89b40..55284fb39f 100644 --- a/flow/network.h +++ b/flow/network.h @@ -271,7 +271,10 @@ struct NetworkMetrics { enum { PRIORITY_BINS = 9 }; int priorityBins[ PRIORITY_BINS ]; + bool priorityBlocked[PRIORITY_BINS]; + double priorityBlockedDuration[PRIORITY_BINS]; double secSquaredPriorityBlocked[PRIORITY_BINS]; + double priorityTimer[PRIORITY_BINS]; double oldestAlternativesFailure; double newestAlternativesFailure; From 7e70fa7fcb6f0ec3d8eaba3c6ba4a12755b5110c Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 26 Jun 2019 14:10:08 -0700 Subject: [PATCH 011/136] Add pull request number to release notes. 
--- documentation/sphinx/source/release-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 05654f5d14..c8bf7fbac7 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -17,7 +17,7 @@ Fixes Status ------ -* Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #) `_. +* Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #1760) `_. Bindings -------- From 08f28e99f96e9339339364fc9524c8326e06e664 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 13:47:45 -0700 Subject: [PATCH 012/136] TeamCollection:Test no server or machine has incorrect team number Add test for simulation test which make sure the server team number per server will be no less than the desired_teams_per_server defined in knobs and no larger than the max_teams_per_server. Add similar test for machine teams number per machine as well. 
--- fdbserver/DataDistribution.actor.cpp | 49 ++++++++++++++++++++++++++++ fdbserver/QuietDatabase.actor.cpp | 19 +++++++++-- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 7b29ce0bdf..2f6a9bbcd2 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1591,6 +1591,34 @@ struct DDTeamCollection : ReferenceCounted { return totalHealthyMachineCount; } + std::pair calculateMinMaxServerTeamNumOnServer() { + int minTeamNumber = std::numeric_limits::max(); + int maxTeamNumber = std::numeric_limits::min(); + for (auto& server : server_info ) { + if ( server.second->teams.size() < minTeamNumber ) { + minTeamNumber = server.second->teams.size(); + } + if ( server.second->teams.size() > maxTeamNumber ) { + maxTeamNumber = server.second->teams.size(); + } + } + return std::make_pair(minTeamNumber, maxTeamNumber); + } + + std::pair calculateMinMaxMachineTeamNumOnMachine() { + int minTeamNumber = std::numeric_limits::max(); + int maxTeamNumber = std::numeric_limits::min(); + for (auto& machine : machine_info) { + if ( machine.second->machineTeams.size() < minTeamNumber ) { + minTeamNumber = machine.second->machineTeams.size(); + } + if ( machine.second->machineTeams.size() > maxTeamNumber ) { + maxTeamNumber = machine.second->machineTeams.size(); + } + } + return std::make_pair(minTeamNumber, maxTeamNumber); + } + // Sanity check bool isServerTeamNumberCorrect(Reference& mt) { int num = 0; @@ -1762,6 +1790,9 @@ struct DDTeamCollection : ReferenceCounted { healthyMachineTeamCount = getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) .detail("AddedTeamNumber", addedTeams) @@ -1775,6 +1806,10 @@ struct DDTeamCollection : 
ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1791,6 +1826,9 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) .detail("AddedTeamNumber", 0) @@ -1804,6 +1842,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -1901,6 +1943,9 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = self->getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); + 
TraceEvent("TeamCollectionInfo", self->distributorId) .detail("Primary", self->primary) .detail("AddedTeamNumber", 0) @@ -1914,6 +1959,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); } } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index b5be5335dc..126779c4bf 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -289,6 +289,11 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr int64_t healthyMachineTeamCount = boost::lexical_cast(teamCollectionInfoMessage.getValue("CurrentHealthyMachineTeamNumber")); int64_t desiredMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("DesiredMachineTeams")); int64_t maxMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeams")); + + int64_t minServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer")); + int64_t maxServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer")); + int64_t minMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine")); + int64_t maxMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine")); // Team number is always valid when we disable teamRemover. 
This avoids false positive in simulation test if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) { @@ -299,7 +304,11 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || + minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || + minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || + maxServerTeamOnServer > SERVER_KNOBS->MAX_TEAMS_PER_SERVER || + maxMachineTeamOnMachine > SERVER_KNOBS->MAX_TEAMS_PER_SERVER) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) @@ -307,7 +316,13 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeamNumber) .detail("CurrentMachineTeamNumber", currentMachineTeamNumber) - .detail("MaxMachineTeams", maxMachineTeamNumber); + .detail("MaxMachineTeams", maxMachineTeamNumber) + .detail("MinTeamNumberOnServer", minServerTeamOnServer) + .detail("MaxTeamNumberOnServer", maxServerTeamOnServer) + .detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine) + .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) + .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) + .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); return false; } else { return true; From 21664742a6f6388d4ba0bdf4145bb7ce865bcdaa Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 14:59:02 -0700 Subject: [PATCH 013/136] TeamCollection:Desired team number may be larger than the max possible team number For example, we have 3 servers for replica factor 3. 
We can have only 1 team but the desired team number is 3 times 5 equal to 15. Instead of sanity checking the absolute team number per server, we check the difference between the minServerTeamOnServer and maxServerTeamOnServer. --- fdbserver/DataDistribution.actor.cpp | 12 ++++++------ fdbserver/QuietDatabase.actor.cpp | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 2f6a9bbcd2..db75ce76e1 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1591,9 +1591,9 @@ struct DDTeamCollection : ReferenceCounted { return totalHealthyMachineCount; } - std::pair calculateMinMaxServerTeamNumOnServer() { - int minTeamNumber = std::numeric_limits::max(); - int maxTeamNumber = std::numeric_limits::min(); + std::pair calculateMinMaxServerTeamNumOnServer() { + uint32_t minTeamNumber = std::numeric_limits::max(); + uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& server : server_info ) { if ( server.second->teams.size() < minTeamNumber ) { minTeamNumber = server.second->teams.size(); @@ -1605,9 +1605,9 @@ struct DDTeamCollection : ReferenceCounted { return std::make_pair(minTeamNumber, maxTeamNumber); } - std::pair calculateMinMaxMachineTeamNumOnMachine() { - int minTeamNumber = std::numeric_limits::max(); - int maxTeamNumber = std::numeric_limits::min(); + std::pair calculateMinMaxMachineTeamNumOnMachine() { + uint32_t minTeamNumber = std::numeric_limits::max(); + uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& machine : machine_info) { if ( machine.second->machineTeams.size() < minTeamNumber ) { minTeamNumber = machine.second->machineTeams.size(); diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 126779c4bf..927bf08440 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -305,10 +305,10 @@ ACTOR Future 
getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. if (healthyMachineTeamCount > desiredMachineTeamNumber || - minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || - minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER || - maxServerTeamOnServer > SERVER_KNOBS->MAX_TEAMS_PER_SERVER || - maxMachineTeamOnMachine > SERVER_KNOBS->MAX_TEAMS_PER_SERVER) { + minServerTeamOnServer <= 0 || + minMachineTeamOnMachine <= 0 || + ( maxServerTeamOnServer > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) || + ( maxMachineTeamOnMachine > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From ee916b337d2c2cf457b91920ceb932c1bcc81ef9 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 15:31:05 -0700 Subject: [PATCH 014/136] TeamCollection:Change the target team number to build When team collection (TC) build server teams and machine teams, it needs to build enough teams such that each server and machine has the DESIRED_TEAMS_PER_SERVER server teams and machine teams. This change calculate the number of teams (server team and machine teams) needed to get each teams for each server and machine. 
--- fdbserver/DataDistribution.actor.cpp | 79 ++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index db75ce76e1..5511d57b8d 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1310,7 +1310,7 @@ struct DDTeamCollection : ReferenceCounted { // Five steps to create each machine team, which are document in the function // Reuse ReplicationPolicy selectReplicas func to select machine team // return number of added machine teams - int addBestMachineTeams(int targetMachineTeamsToBuild) { + int addBestMachineTeams(int targetMachineTeamsToBuild, int remainingMachineTeamBudget) { int addedMachineTeams = 0; int totalServerIndex = 0; int machineTeamsToBuild = 0; @@ -1329,7 +1329,7 @@ struct DDTeamCollection : ReferenceCounted { int loopCount = 0; // Add a team in each iteration - while (addedMachineTeams < machineTeamsToBuild) { + while (addedMachineTeams < machineTeamsToBuild || addedMachineTeams < remainingMachineTeamBudget) { // Step 2: Get least used machines from which we choose machines as a machine team std::vector> leastUsedMachines; // A less used machine has less number of teams int minTeamCount = std::numeric_limits::max(); @@ -1432,6 +1432,8 @@ struct DDTeamCollection : ReferenceCounted { addMachineTeam(machines); addedMachineTeams++; + // Update the remaining machine team budget because the budget may decrease by any value between 1 and storageTeamSize + remainingMachineTeamBudget = getRemainingMachineTeamBudget(); } else { TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId) .detail("Primary", primary) @@ -1669,11 +1671,48 @@ struct DDTeamCollection : ReferenceCounted { return healthyTeamCount; } + // Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has 
SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int getRemainingMachineTeamBudget() { + int remainingMachineTeamBudget = 0; + for ( auto& m : machine_info ) { + int healthyMTCount = 0; + for ( auto& mt : m.second->machineTeams ) { + if ( isMachineTeamHealthy(mt) ) { + ++healthyMTCount; + } + } + remainingMachineTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - healthyMTCount)); + } + + // We over-provision the remainingMachineTeamBudget because we do not know when a new machine team is built, how many times it can be counted into the budget + // For example, when a new machine is added, a new machine team only consume 1 such budget + return remainingMachineTeamBudget; + } + + // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + int getRemainingServerTeamBudget() { + // remainingTeamBudget is the number of teams needed to ensure every server has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int remainingTeamBudget = 0; + for ( auto& s : server_info ) { + int numValidTeams = 0; + for ( auto& team : s.second->teams ) { + if ( !team->isWrongConfiguration() && team->isHealthy() ) { + ++numValidTeams; + } + } + remainingTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); + } + + return remainingTeamBudget; + } + + // Create server teams based on machine teams // Before the number of machine teams reaches the threshold, build a machine team for each server team // When it reaches the threshold, first try to build a server team with existing machine teams; if failed, // build an extra machine team and record the event in trace - int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber) { + int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber, int remainingTeamBudget) { ASSERT(teamsToBuild > 0); ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0); @@ -1685,8 +1724,8 @@ struct DDTeamCollection : ReferenceCounted { // When we 
change configuration, we may have machine teams with storageTeamSize in the old configuration. int healthyMachineTeamCount = getHealthyMachineTeamCount(); int totalMachineTeamCount = machineTeams.size(); - int totalHealthyMachineCount = calculateHealthyMachineCount(); + int remainingMachineTeamBudget = getRemainingMachineTeamBudget(); int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; @@ -1699,13 +1738,14 @@ struct DDTeamCollection : ReferenceCounted { .detail("HealthyMachineTeamCount", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) - .detail("MachineTeamsToBuild", machineTeamsToBuild); + .detail("MachineTeamsToBuild", machineTeamsToBuild) + .detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); // Pre-build all machine teams until we have the desired number of machine teams if (machineTeamsToBuild > 0) { - addedMachineTeams = addBestMachineTeams(machineTeamsToBuild); + addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget); } - while (addedTeams < teamsToBuild) { + while (addedTeams < teamsToBuild || addedTeams < remainingTeamBudget) { // Step 1: Create 1 best machine team std::vector bestServerTeam; int bestScore = std::numeric_limits::max(); @@ -1782,6 +1822,7 @@ struct DDTeamCollection : ReferenceCounted { // Step 4: Add the server team addTeam(bestServerTeam.begin(), bestServerTeam.end(), false); addedTeams++; + remainingTeamBudget = getRemainingServerTeamBudget(); if (++loopCount > 2 * teamsToBuild * (configuration.storageTeamSize + 1)) { break; @@ -1901,6 +1942,9 @@ struct DDTeamCollection : ReferenceCounted { totalTeamCount++; } } + // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + // remainingTeamBudget is the number of teams needed to ensure every server has 
SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int remainingTeamBudget = self->getRemainingServerTeamBudget(); // teamsToBuild is calculated such that we will not build too many teams in the situation // when all (or most of) teams become unhealthy temporarily and then healthy again @@ -1927,7 +1971,7 @@ struct DDTeamCollection : ReferenceCounted { // addTeamsBestOf() will not add more teams than needed. // If the team number is more than the desired, the extra teams are added in the code path when // a team is added as an initial team - int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams); + int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams, remainingTeamBudget); if (addedTeams <= 0 && self->teams.size() == 0) { TraceEvent(SevWarn, "NoTeamAfterBuildTeam") @@ -3005,8 +3049,9 @@ ACTOR Future storageServerTracker( if(hasWrongStoreTypeOrDC) self->restartRecruiting.trigger(); - if ( lastIsUnhealthy && !status.isUnhealthy() && !server->teams.size() ) { + if ( lastIsUnhealthy && !status.isUnhealthy() && server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) { self->doBuildTeams = true; + self->restartTeamBuilder.trigger(); } lastIsUnhealthy = status.isUnhealthy(); @@ -3894,7 +3939,7 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/UseMachineID") { Reference policy = Reference(new PolicyAcross(teamSize, "zoneid", Reference(new PolicyOne()))); state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize); - int result = collection->addTeamsBestOf(30, desiredTeams, maxTeams); + collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30); ASSERT(collection->sanityCheckTeams() == true); @@ -3919,8 +3964,8 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") { return Void(); } - collection->addBestMachineTeams(30); // Create machine teams to help debug - int result = collection->addTeamsBestOf(30, desiredTeams, maxTeams); + collection->addBestMachineTeams(30, 30); // Create machine 
teams to help debug + collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30); collection->sanityCheckTeams(); // Server team may happen to be on the same machine team, although unlikely if (collection) delete (collection); @@ -3935,7 +3980,7 @@ TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") { state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize; state DDTeamCollection* collection = testTeamCollection(3, policy, processSize); - int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams, 200); delete(collection); @@ -3955,7 +4000,7 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") { state DDTeamCollection* collection = testTeamCollection(3, policy, processSize); - int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10); delete(collection); @@ -3975,7 +4020,7 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") { collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); - int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams, 8); ASSERT(result == 8); @@ -4005,8 +4050,8 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); - int resultMachineTeams = collection->addBestMachineTeams(10); - int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams); + collection->addBestMachineTeams(10, 10); + int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10); if (collection->machineTeams.size() != 10 || result != 8) { collection->traceAllInfo(true); // Debug message From e1d459075a940719040101f89581933157809baf Mon 
Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 17:30:29 -0700 Subject: [PATCH 015/136] TeamCollection:Count healthy machine teams only Team collection should prioritize to build machine teams for a machine that has the least number of healthy machine teams, instead of just machine teams, because unhealthy machine team will not be able to produce more server teams. --- fdbserver/DataDistribution.actor.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 5511d57b8d..334b380cd2 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1342,7 +1342,12 @@ struct DDTeamCollection : ReferenceCounted { // Invariant: We only create correct size machine teams. // When configuration (e.g., team size) is changed, the DDTeamCollection will be destroyed and rebuilt // so that the invariant will not be violated. - int teamCount = machine.second->machineTeams.size(); + int teamCount = 0; + for (auto& mt : machine.second->machineTeams) { + if ( isMachineTeamHealthy(mt) ) { + ++teamCount; + } + } if (teamCount < minTeamCount) { leastUsedMachines.clear(); From 02cdcc0b0c09994c1904b01b07f3aa02285eb204 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 17:35:57 -0700 Subject: [PATCH 016/136] TeamCollectionTest: Only ensure each server and machine have a team --- fdbserver/QuietDatabase.actor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 927bf08440..9edb9d06e3 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -306,9 +306,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // if redundant teams exist. 
if (healthyMachineTeamCount > desiredMachineTeamNumber || minServerTeamOnServer <= 0 || - minMachineTeamOnMachine <= 0 || - ( maxServerTeamOnServer > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minServerTeamOnServer < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) || - ( maxMachineTeamOnMachine > SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER && minMachineTeamOnMachine < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) ) { + minMachineTeamOnMachine <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From c23d89c98aa272ec926bea362ee7d3c795d93741 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 17:56:54 -0700 Subject: [PATCH 017/136] TeamCollection:Only count healthy teams for a server When team collection add new server teams, it picks a team with the least number of teams. We should only consider the healthy teams because the unhealthy ones will not be useful. --- fdbserver/DataDistribution.actor.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 334b380cd2..721dcd11df 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1507,7 +1507,12 @@ struct DDTeamCollection : ReferenceCounted { // Only pick healthy server, which is not failed or excluded. 
if (server_status.get(server.first).isUnhealthy()) continue; - int numTeams = server.second->teams.size(); + int numTeams = 0; + for (auto& t : server.second->teams) { + if (!t->isWrongConfiguration() && t->isHealthy()) { + ++numTeams; + } + } if (numTeams < minTeamNumber) { minTeamNumber = numTeams; leastUsedServers.clear(); @@ -3056,7 +3061,7 @@ ACTOR Future storageServerTracker( if ( lastIsUnhealthy && !status.isUnhealthy() && server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) { self->doBuildTeams = true; - self->restartTeamBuilder.trigger(); + self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams } lastIsUnhealthy = status.isUnhealthy(); From cc6a0e9bcdae581fae748b3129c22dbd5ab3d014 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 19:33:38 -0700 Subject: [PATCH 018/136] TeamCollectionTest:Do not enforce minServerTeamOnServer larger than 0 In ConfigureTest, one server may be left with 0 server teams, even if we call buildTeams in the storageServerTracker. 
--- fdbserver/DataDistribution.actor.cpp | 2 +- fdbserver/QuietDatabase.actor.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 721dcd11df..443c47fc7b 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1682,7 +1682,7 @@ struct DDTeamCollection : ReferenceCounted { } // Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, - // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int getRemainingMachineTeamBudget() { int remainingMachineTeamBudget = 0; for ( auto& m : machine_info ) { diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 9edb9d06e3..7350420775 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -305,7 +305,6 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. 
if (healthyMachineTeamCount > desiredMachineTeamNumber || - minServerTeamOnServer <= 0 || minMachineTeamOnMachine <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) From 53324e4db753d4738069e3a664a2dfd5fa7a8922 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 19:38:12 -0700 Subject: [PATCH 019/136] TeamCollectionInfo: clang format --- fdbserver/DataDistribution.actor.cpp | 95 +++++++++++++++------------- fdbserver/QuietDatabase.actor.cpp | 29 +++++---- 2 files changed, 66 insertions(+), 58 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 443c47fc7b..3dd70b15e7 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1344,7 +1344,7 @@ struct DDTeamCollection : ReferenceCounted { // so that the invariant will not be violated. int teamCount = 0; for (auto& mt : machine.second->machineTeams) { - if ( isMachineTeamHealthy(mt) ) { + if (isMachineTeamHealthy(mt)) { ++teamCount; } } @@ -1437,7 +1437,8 @@ struct DDTeamCollection : ReferenceCounted { addMachineTeam(machines); addedMachineTeams++; - // Update the remaining machine team budget because the budget may decrease by any value between 1 and storageTeamSize + // Update the remaining machine team budget because the budget may decrease by any value between 1 and + // storageTeamSize remainingMachineTeamBudget = getRemainingMachineTeamBudget(); } else { TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId) @@ -1509,9 +1510,9 @@ struct DDTeamCollection : ReferenceCounted { int numTeams = 0; for (auto& t : server.second->teams) { - if (!t->isWrongConfiguration() && t->isHealthy()) { - ++numTeams; - } + if (!t->isWrongConfiguration() && t->isHealthy()) { + ++numTeams; + } } if (numTeams < minTeamNumber) { minTeamNumber = numTeams; @@ -1606,11 +1607,11 @@ struct DDTeamCollection : ReferenceCounted { std::pair calculateMinMaxServerTeamNumOnServer() { uint32_t 
minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); - for (auto& server : server_info ) { - if ( server.second->teams.size() < minTeamNumber ) { + for (auto& server : server_info) { + if (server.second->teams.size() < minTeamNumber) { minTeamNumber = server.second->teams.size(); } - if ( server.second->teams.size() > maxTeamNumber ) { + if (server.second->teams.size() > maxTeamNumber) { maxTeamNumber = server.second->teams.size(); } } @@ -1621,10 +1622,10 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& machine : machine_info) { - if ( machine.second->machineTeams.size() < minTeamNumber ) { + if (machine.second->machineTeams.size() < minTeamNumber) { minTeamNumber = machine.second->machineTeams.size(); } - if ( machine.second->machineTeams.size() > maxTeamNumber ) { + if (machine.second->machineTeams.size() > maxTeamNumber) { maxTeamNumber = machine.second->machineTeams.size(); } } @@ -1682,41 +1683,43 @@ struct DDTeamCollection : ReferenceCounted { } // Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, - // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int getRemainingMachineTeamBudget() { - int remainingMachineTeamBudget = 0; - for ( auto& m : machine_info ) { + int remainingMachineTeamBudget = 0; + for (auto& m : machine_info) { int healthyMTCount = 0; - for ( auto& mt : m.second->machineTeams ) { - if ( isMachineTeamHealthy(mt) ) { + for (auto& mt : m.second->machineTeams) { + if (isMachineTeamHealthy(mt)) { ++healthyMTCount; } } - remainingMachineTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - healthyMTCount)); + 
remainingMachineTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - healthyMTCount)); } - // We over-provision the remainingMachineTeamBudget because we do not know when a new machine team is built, how many times it can be counted into the budget - // For example, when a new machine is added, a new machine team only consume 1 such budget + // We over-provision the remainingMachineTeamBudget because we do not know, when a new machine team is built, + // how many times it can be counted into the budget. For example, when a new machine is added, + // a new machine team only consume 1 such budget return remainingMachineTeamBudget; } // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, int getRemainingServerTeamBudget() { - // remainingTeamBudget is the number of teams needed to ensure every server has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingTeamBudget is the number of teams needed to ensure every server has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int remainingTeamBudget = 0; - for ( auto& s : server_info ) { + for (auto& s : server_info) { int numValidTeams = 0; - for ( auto& team : s.second->teams ) { - if ( !team->isWrongConfiguration() && team->isHealthy() ) { + for (auto& team : s.second->teams) { + if (!team->isWrongConfiguration() && team->isHealthy()) { ++numValidTeams; } } - remainingTeamBudget += std::max(0, (int) (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); + remainingTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); } return remainingTeamBudget; } - // Create server teams based on machine teams // Before the number of machine teams reaches the threshold, build a machine team for each server team @@ -1749,7 +1752,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("MachineTeamsToBuild", machineTeamsToBuild) - 
.detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); + .detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); // Pre-build all machine teams until we have the desired number of machine teams if (machineTeamsToBuild > 0) { addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget); @@ -1841,8 +1844,8 @@ struct DDTeamCollection : ReferenceCounted { healthyMachineTeamCount = getHealthyMachineTeamCount(); - std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) @@ -1857,10 +1860,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) - .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) - .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) - .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1877,8 +1880,8 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = getHealthyMachineTeamCount(); - std::pair 
minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) @@ -1893,10 +1896,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) - .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) - .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) - .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -1953,7 +1956,8 @@ struct DDTeamCollection : ReferenceCounted { } } // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, - // remainingTeamBudget is the number of teams needed to ensure every server has SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + // remainingTeamBudget is the number of teams needed to ensure every server has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams int remainingTeamBudget = self->getRemainingServerTeamBudget(); // teamsToBuild is calculated such that we will not build too many teams in the situation @@ -1997,8 +2001,8 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int 
healthyMachineTeamCount = self->getHealthyMachineTeamCount(); - std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); TraceEvent("TeamCollectionInfo", self->distributorId) .detail("Primary", self->primary) @@ -2013,10 +2017,10 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) - .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) - .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) - .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) - .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) .trackLatest("TeamCollectionInfo"); } } @@ -3059,7 +3063,8 @@ ACTOR Future storageServerTracker( if(hasWrongStoreTypeOrDC) self->restartRecruiting.trigger(); - if ( lastIsUnhealthy && !status.isUnhealthy() && server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER ) { + if (lastIsUnhealthy && !status.isUnhealthy() && + server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) { self->doBuildTeams = true; self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams } diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 7350420775..72bda5cba6 100644 --- 
a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -289,11 +289,15 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr int64_t healthyMachineTeamCount = boost::lexical_cast(teamCollectionInfoMessage.getValue("CurrentHealthyMachineTeamNumber")); int64_t desiredMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("DesiredMachineTeams")); int64_t maxMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeams")); - - int64_t minServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer")); - int64_t maxServerTeamOnServer = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer")); - int64_t minMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine")); - int64_t maxMachineTeamOnMachine = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine")); + + int64_t minServerTeamOnServer = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer")); + int64_t maxServerTeamOnServer = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer")); + int64_t minMachineTeamOnMachine = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine")); + int64_t maxMachineTeamOnMachine = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine")); // Team number is always valid when we disable teamRemover. This avoids false positive in simulation test if (SERVER_KNOBS->TR_FLAG_DISABLE_TEAM_REMOVER) { @@ -304,8 +308,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. 
- if (healthyMachineTeamCount > desiredMachineTeamNumber || - minMachineTeamOnMachine <= 0 ) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) @@ -314,12 +317,12 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("DesiredMachineTeams", desiredMachineTeamNumber) .detail("CurrentMachineTeamNumber", currentMachineTeamNumber) .detail("MaxMachineTeams", maxMachineTeamNumber) - .detail("MinTeamNumberOnServer", minServerTeamOnServer) - .detail("MaxTeamNumberOnServer", maxServerTeamOnServer) - .detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine) - .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) - .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) - .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); + .detail("MinTeamNumberOnServer", minServerTeamOnServer) + .detail("MaxTeamNumberOnServer", maxServerTeamOnServer) + .detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine) + .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) + .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) + .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); return false; } else { return true; From aaf97542e9e409b5989bb75437547ff2dcf030ec Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 26 Jun 2019 22:37:34 -0700 Subject: [PATCH 020/136] TeamCollectionTest: Update unit test --- fdbserver/DataDistribution.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 3dd70b15e7..a7b1f99d11 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4019,7 +4019,7 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") { 
delete(collection); - ASSERT(result == 10); + ASSERT(result >= 10); return Void(); } @@ -4037,7 +4037,7 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") { int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams, 8); - ASSERT(result == 8); + ASSERT(result >= 8); for(auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) { auto teamCount = process->second->teams.size(); From 90c158984c6b0fec6870b0468750bbdee26c67ba Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 11:12:48 -0700 Subject: [PATCH 021/136] TeamCollection:Add extra trace events --- fdbserver/DataDistribution.actor.cpp | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index a7b1f99d11..f89e7d5cc9 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1669,6 +1669,26 @@ struct DDTeamCollection : ReferenceCounted { return std::pair, int>(retMT, minNumProcessTeams); } + // Find the machine team with the largest number of server teams + std::pair, int> getMachineTeamWithMostProcessTeams() { + Reference retMT; + int maxNumProcessTeams = std::numeric_limits::min(); + + for (auto& mt : machineTeams) { + if (EXPENSIVE_VALIDATION) { + ASSERT(isServerTeamNumberCorrect(mt)); + } + int size = mt->serverTeams.size(); + if ( size > maxNumProcessTeams) { + maxNumProcessTeams = mt->serverTeams.size(); + retMT = mt; + } + } + + return std::pair, int>(retMT, maxNumProcessTeams); + } + + int getHealthyMachineTeamCount() { int healthyTeamCount = 0; for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) { @@ -1864,6 +1884,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", 
minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1900,6 +1921,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -2021,6 +2043,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); } } @@ -2418,6 +2441,14 @@ ACTOR Future teamRemover(DDTeamCollection* self) { team = mt->serverTeams[teamIndex]; ASSERT(team->machineTeam->machineIDs == mt->machineIDs); // Sanity check + // Check if a server will have 0 team after the team is removed + for (auto& s : team->getServers()) { + if ( s->teams.size() == 0 ) { + TraceEvent(SevError, "TeamRemoverTooAggressive").detail("Server", s->id).detail("Team", team->getServerIDsStr()); + self->traceAllInfo(true); + } + } + // The team will be marked as a bad team bool foundTeam = self->removeTeam(team); ASSERT(foundTeam == true); From 5f5c4042919793996b65a3bbf9b3da5d08c46125 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 13:47:46 -0700 Subject: [PATCH 022/136] BugFix:ReplicationPolicy always fails when teamSize is 1 Whenever use selectReplicas function, be careful that it may have bugs! This bug is that it always return false (not able to find candidates) when the storage team size is 1. This is wrong because when storage team size is 1, the selectReplicas should return an empty result. 
--- fdbserver/DataDistribution.actor.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index f89e7d5cc9..b93aecc76f 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1367,6 +1367,7 @@ struct DDTeamCollection : ReferenceCounted { if (leastUsedMachines.size()) { // Randomly choose 1 least used machine Reference tcMachineInfo = g_random->randomChoice(leastUsedMachines); + TraceEvent("MXDEBUG", distributorId).detail("MachineID", tcMachineInfo->machineID.contents().toString()).detail("Servers", tcMachineInfo->getServersIDStr()); ASSERT(!tcMachineInfo->serversOnMachine.empty()); LocalityEntry process = tcMachineInfo->localityEntry; forcedAttributes.push_back(process); @@ -1384,9 +1385,13 @@ struct DDTeamCollection : ReferenceCounted { // that have the least-utilized server team.clear(); auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team); - if (!success) { + if (!success && configuration.storageTeamSize > 1) { // NOTE: selectReplicas() returns false always when storageTeamSize == 1 + TraceEvent("MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize); break; } + if ( !success && configuration.storageTeamSize == 1 && forcedAttributes.size() > 0 ) { + TraceEvent(SevError, "MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize).detail("Success", success); + } ASSERT(forcedAttributes.size() > 0); team.push_back((UID*)machineLocalityMap.getObject(forcedAttributes[0])); @@ -2043,7 +2048,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + .detail("DoBuildTeams", 
self->doBuildTeams) .trackLatest("TeamCollectionInfo"); } } @@ -3215,7 +3220,9 @@ ACTOR Future storageServerTracker( //Restart the storeTracker for the new interface storeTracker = keyValueStoreTypeTracker(self, server); hasWrongStoreTypeOrDC = false; + self->doBuildTeams = true; self->restartTeamBuilder.trigger(); + self->traceTeamCollectionInfo(); if(restartRecruiting) self->restartRecruiting.trigger(); } From 2993a96de8f325f34381cc01bfe872a8d40c2123 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 14:15:51 -0700 Subject: [PATCH 023/136] TeamCollectionInfo: Remove debug trace and apply clang format --- fdbserver/DataDistribution.actor.cpp | 44 +++++++--------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index b93aecc76f..58b62832d1 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1367,7 +1367,6 @@ struct DDTeamCollection : ReferenceCounted { if (leastUsedMachines.size()) { // Randomly choose 1 least used machine Reference tcMachineInfo = g_random->randomChoice(leastUsedMachines); - TraceEvent("MXDEBUG", distributorId).detail("MachineID", tcMachineInfo->machineID.contents().toString()).detail("Servers", tcMachineInfo->getServersIDStr()); ASSERT(!tcMachineInfo->serversOnMachine.empty()); LocalityEntry process = tcMachineInfo->localityEntry; forcedAttributes.push_back(process); @@ -1385,13 +1384,10 @@ struct DDTeamCollection : ReferenceCounted { // that have the least-utilized server team.clear(); auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team); - if (!success && configuration.storageTeamSize > 1) { // NOTE: selectReplicas() returns false always when storageTeamSize == 1 - TraceEvent("MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize); + // NOTE: selectReplicas() returns false always when storageTeamSize == 1 + if (!success 
&& configuration.storageTeamSize > 1) { break; } - if ( !success && configuration.storageTeamSize == 1 && forcedAttributes.size() > 0 ) { - TraceEvent(SevError, "MXDEBUG", distributorId).detail("TeamSize", configuration.storageTeamSize).detail("Success", success); - } ASSERT(forcedAttributes.size() > 0); team.push_back((UID*)machineLocalityMap.getObject(forcedAttributes[0])); @@ -1442,8 +1438,8 @@ struct DDTeamCollection : ReferenceCounted { addMachineTeam(machines); addedMachineTeams++; - // Update the remaining machine team budget because the budget may decrease by any value between 1 and - // storageTeamSize + // Update the remaining machine team budget because the budget may decrease by + // any value between 1 and storageTeamSize remainingMachineTeamBudget = getRemainingMachineTeamBudget(); } else { TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId) @@ -1674,26 +1670,6 @@ struct DDTeamCollection : ReferenceCounted { return std::pair, int>(retMT, minNumProcessTeams); } - // Find the machine team with the largest number of server teams - std::pair, int> getMachineTeamWithMostProcessTeams() { - Reference retMT; - int maxNumProcessTeams = std::numeric_limits::min(); - - for (auto& mt : machineTeams) { - if (EXPENSIVE_VALIDATION) { - ASSERT(isServerTeamNumberCorrect(mt)); - } - int size = mt->serverTeams.size(); - if ( size > maxNumProcessTeams) { - maxNumProcessTeams = mt->serverTeams.size(); - retMT = mt; - } - } - - return std::pair, int>(retMT, maxNumProcessTeams); - } - - int getHealthyMachineTeamCount() { int healthyTeamCount = 0; for (auto mt = machineTeams.begin(); mt != machineTeams.end(); ++mt) { @@ -1889,7 +1865,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + 
.detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1926,7 +1902,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); // Debug purpose @@ -2048,7 +2024,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", self->doBuildTeams) + .detail("DoBuildTeams", self->doBuildTeams) .trackLatest("TeamCollectionInfo"); } } @@ -2448,8 +2424,10 @@ ACTOR Future teamRemover(DDTeamCollection* self) { // Check if a server will have 0 team after the team is removed for (auto& s : team->getServers()) { - if ( s->teams.size() == 0 ) { - TraceEvent(SevError, "TeamRemoverTooAggressive").detail("Server", s->id).detail("Team", team->getServerIDsStr()); + if (s->teams.size() == 0) { + TraceEvent(SevError, "TeamRemoverTooAggressive") + .detail("Server", s->id) + .detail("Team", team->getServerIDsStr()); self->traceAllInfo(true); } } From 8d5e8488081940fd4d4c4031fd0ee45a3ee4dc9f Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 14:22:41 -0700 Subject: [PATCH 024/136] QuitDatabase test: Check each server has at least 1 team --- fdbserver/QuietDatabase.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 72bda5cba6..4245495f07 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -308,7 +308,7 @@ ACTOR Future 
getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0 || minServerTeamOnServer <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From 52efcfd136f0561469a6d04bb320b02343a689bc Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 27 Jun 2019 15:15:05 -0700 Subject: [PATCH 025/136] fix: properly create the right number for txsTags when changing between different numbers of logs --- fdbserver/TagPartitionedLogSystem.actor.cpp | 134 ++++++++++++-------- fdbserver/masterserver.actor.cpp | 6 - flow/ProtocolVersion.h | 2 +- 3 files changed, 83 insertions(+), 59 deletions(-) diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index eafb2e9554..2abd10c8c5 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -1764,8 +1764,37 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted localTags = getLocalTags(remoteLocality, allTags); LogSystemConfig oldLogSystemConfig = oldLogSystem->getLogSystemConfig(); + logSet->tLogLocalities.resize( remoteWorkers.remoteTLogs.size() ); + logSet->logServers.resize( remoteWorkers.remoteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size + logSet->updateLocalitySet(localities); + state vector> remoteTLogInitializationReplies; vector< InitializeTLogRequest > remoteTLogReqs( remoteWorkers.remoteTLogs.size() ); + + if(oldLogSystem->logRouterTags == 0) { + std::vector locations; + for( Tag tag : localTags ) { + locations.clear(); + logSet->getPushLocations( vector(1, tag), locations, 0 ); 
+ for(int loc : locations) + remoteTLogReqs[ loc ].recoverTags.push_back( tag ); + } + + if(oldLogSystem->tLogs.size()) { + for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); + locations.clear(); + logSet->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + remoteTLogReqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < self->tLogs[0]->logServers.size(); i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } + localTags.push_back(txsTag); + } + } + for( int i = 0; i < remoteWorkers.remoteTLogs.size(); i++ ) { InitializeTLogRequest &req = remoteTLogReqs[i]; req.recruitmentID = self->recruitmentID; @@ -1785,20 +1814,6 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->logServers.size(); } - logSet->tLogLocalities.resize( remoteWorkers.remoteTLogs.size() ); - logSet->logServers.resize( remoteWorkers.remoteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size - logSet->updateLocalitySet(localities); - - if(oldLogSystem->logRouterTags == 0) { - std::vector locations; - for( Tag tag : localTags ) { - locations.clear(); - logSet->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - remoteTLogReqs[ loc ].recoverTags.push_back( tag ); - } - } - for( int i = 0; i < remoteWorkers.remoteTLogs.size(); i++ ) remoteTLogInitializationReplies.push_back( transformErrors( throwErrorOr( remoteWorkers.remoteTLogs[i].tLog.getReplyUnlessFailedFor( remoteTLogReqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); @@ -1940,6 +1955,36 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> initializationReplies; vector< InitializeTLogRequest > reqs( recr.tLogs.size() ); + + logSystem->tLogs[0]->tLogLocalities.resize( recr.tLogs.size() ); + logSystem->tLogs[0]->logServers.resize( recr.tLogs.size() ); // 
Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size + logSystem->tLogs[0]->updateLocalitySet(localities); + + std::vector locations; + for( Tag tag : localTags ) { + locations.clear(); + logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + reqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + reqs[ logSystem->tLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); + } + if(oldLogSystem->tLogs.size()) { + for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); + locations.clear(); + logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + reqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < recr.tLogs.size(); i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } + localTags.push_back(txsTag); + } + for( int i = 0; i < recr.tLogs.size(); i++ ) { InitializeTLogRequest &req = reqs[i]; req.recruitmentID = logSystem->recruitmentID; @@ -1959,36 +2004,37 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->tLogLocalities.resize( recr.tLogs.size() ); - logSystem->tLogs[0]->logServers.resize( recr.tLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size - logSystem->tLogs[0]->updateLocalitySet(localities); - - for(int i = 0; i < oldLogSystem->logRouterTags; i++) { - Tag tag = Tag(tagLocalityLogRouter, i); - reqs[ logSystem->tLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); - } - std::vector locations; - for( Tag tag : localTags ) { - locations.clear(); - logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - reqs[ loc ].recoverTags.push_back( tag ); - } - for( int i = 0; i < recr.tLogs.size(); i++ ) initializationReplies.push_back( 
transformErrors( throwErrorOr( recr.tLogs[i].tLog.getReplyUnlessFailedFor( reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); state std::vector> recoveryComplete; if(region.satelliteTLogReplicationFactor > 0) { - std::vector satelliteTags; - for(int i = 0; i < recr.tLogs.size(); i++) { - satelliteTags.push_back(Tag(tagLocalityTxs, i)); - } - satelliteTags.push_back(txsTag); - state vector> satelliteInitializationReplies; vector< InitializeTLogRequest > sreqs( recr.satelliteTLogs.size() ); + std::vector satelliteTags; + + for(int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } + if(oldLogSystem->tLogs.size()) { + for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } + for(int i = 0; i < recr.tLogs.size(); i++) { + satelliteTags.push_back(Tag(tagLocalityTxs, i)); + } + satelliteTags.push_back(txsTag); + } + for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) { InitializeTLogRequest &req = sreqs[i]; req.recruitmentID = logSystem->recruitmentID; @@ -2008,22 +2054,6 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags; i++) { - Tag tag = i == -1 ? 
txsTag : Tag(tagLocalityLogRouter, i); - locations.clear(); - logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - sreqs[ loc ].recoverTags.push_back( tag ); - } - - for(int i = 0; i < recr.tLogs.size(); i++) { - Tag tag = Tag(tagLocalityTxs, i); - locations.clear(); - logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - sreqs[ loc ].recoverTags.push_back( tag ); - } - for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) satelliteInitializationReplies.push_back( transformErrors( throwErrorOr( recr.satelliteTLogs[i].tLog.getReplyUnlessFailedFor( sreqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) ); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 55de67106a..abb001d95e 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -675,12 +675,6 @@ ACTOR Future readTransactionSystemState( Reference self, Refer Standalone> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) ); self->allTags.clear(); - if(self->lastEpochEnd > 0) { - for(int i = 0; i < oldLogSystem->getLogSystemConfig().tLogs[0].tLogs.size(); i++) { - self->allTags.push_back(Tag(tagLocalityTxs, i)); - } - self->allTags.push_back(txsTag); - } if(self->forceRecovery) { self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality; diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index 95842aae94..f236a4fabf 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -85,7 +85,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B061020000LL, EndpointAddrList); PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, IPv6); PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, TLogVersion); - PROTOCOL_VERSION_FEATURE(0x0FDB00B061060000LL, PseudoLocalities); + PROTOCOL_VERSION_FEATURE(0x0FDB00B061070000LL, PseudoLocalities); }; // These 
impact both communications and the deserialization of certain database and IKeyValueStore keys. From ee41311a54b44163ea721c0ac8e54787c576af8e Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 15:06:17 -0700 Subject: [PATCH 026/136] TeamCollection:Call addTeamsBestOf when remainingTeamBudget is not 0 --- fdbserver/DataDistribution.actor.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 58b62832d1..6316c2cc41 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1727,7 +1727,7 @@ struct DDTeamCollection : ReferenceCounted { // When it reaches the threshold, first try to build a server team with existing machine teams; if failed, // build an extra machine team and record the event in trace int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber, int remainingTeamBudget) { - ASSERT(teamsToBuild > 0); + ASSERT(teamsToBuild >= 0); ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0); int addedMachineTeams = 0; @@ -1852,6 +1852,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("Primary", primary) .detail("AddedTeamNumber", addedTeams) .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("RemainingTeamBudget", remainingTeamBudget) .detail("CurrentTeamNumber", teams.size()) .detail("DesiredTeamNumber", desiredTeamNumber) .detail("MaxTeamNumber", maxTeamNumber) @@ -1889,6 +1890,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("Primary", primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", 0) + .detail("RemainingTeamBudget", 0) .detail("CurrentTeamNumber", teams.size()) .detail("DesiredTeamNumber", desiredServerTeams) .detail("MaxTeamNumber", maxServerTeams) @@ -1965,7 +1967,7 @@ struct DDTeamCollection : ReferenceCounted { // teamsToBuild is calculated such that we will not build too many teams in the situation // when all (or most of) 
teams become unhealthy temporarily and then healthy again - state int teamsToBuild = std::min(desiredTeams - teamCount, maxTeams - totalTeamCount); + state int teamsToBuild = std::max(0, std::min(desiredTeams - teamCount, maxTeams - totalTeamCount)); TraceEvent("BuildTeamsBegin", self->distributorId) .detail("TeamsToBuild", teamsToBuild) @@ -1982,7 +1984,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("MachineCount", self->machine_info.size()) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER); - if (teamsToBuild > 0) { + if (teamsToBuild > 0 || remainingTeamBudget > 0) { state vector> builtTeams; // addTeamsBestOf() will not add more teams than needed. @@ -2011,6 +2013,7 @@ struct DDTeamCollection : ReferenceCounted { .detail("Primary", self->primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("RemainingTeamBudget", remainingTeamBudget) .detail("CurrentTeamNumber", self->teams.size()) .detail("DesiredTeamNumber", desiredTeams) .detail("MaxTeamNumber", maxTeams) From 42620e4831d3af5ac489578e2a8563d0fb8c3dac Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 16:52:36 -0700 Subject: [PATCH 027/136] TeamCollectionTest:GetTeamCollectionValid wait until values are correct --- fdbserver/DataDistribution.actor.cpp | 5 +++-- fdbserver/QuietDatabase.actor.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 6316c2cc41..28da690d09 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3190,7 +3190,7 @@ ACTOR Future storageServerTracker( self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); // The team number changes, so we need to update the team number info - self->traceTeamCollectionInfo(); + //self->traceTeamCollectionInfo(); } } @@ -3458,7 +3458,6 @@ ACTOR Future dataDistributionTeamCollection( 
self->redundantTeamRemover = teamRemover(self); self->addActor.send(self->redundantTeamRemover); } - self->traceTeamCollectionInfo(); if(self->includedDCs.size()) { //start this actor before any potential recruitments can happen @@ -3472,6 +3471,8 @@ ACTOR Future dataDistributionTeamCollection( self->addActor.send(monitorHealthyTeams( self )); self->addActor.send(waitHealthyZoneChange( self )); + self->traceTeamCollectionInfo(); + // SOMEDAY: Monitor FF/serverList for (new) servers that aren't in allServers and add or remove them loop choose { diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 4245495f07..f5868713f1 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -323,7 +323,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); - return false; + wait(delay(10.0)); } else { return true; } From bc3e83363409e1bef67426cb5c1c3e31b18d9d4e Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 16:53:01 -0700 Subject: [PATCH 028/136] TeamCollection: Add release note --- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 2e07e729ae..6ca2490060 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -14,6 +14,7 @@ Fixes ----- * The ``fdbrestore`` commands ``abort``, ``wait``, and ``status`` would use a default cluster file instead of the destination cluster file argument. `(PR #1701) `_ +* Ensure new added machines are used to build teams and host data from existing machines when a cluster is expanded. 
`(PR #1764) `_ 6.1.9 ===== From 4fe3c7f749890c3812e43cc4aed5ece00c70fcc1 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 17:09:21 -0700 Subject: [PATCH 029/136] TeamCollectionInfo:Revert to original version where it is --- fdbserver/DataDistribution.actor.cpp | 5 ++--- fdbserver/QuietDatabase.actor.cpp | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 28da690d09..6316c2cc41 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3190,7 +3190,7 @@ ACTOR Future storageServerTracker( self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); // The team number changes, so we need to update the team number info - //self->traceTeamCollectionInfo(); + self->traceTeamCollectionInfo(); } } @@ -3458,6 +3458,7 @@ ACTOR Future dataDistributionTeamCollection( self->redundantTeamRemover = teamRemover(self); self->addActor.send(self->redundantTeamRemover); } + self->traceTeamCollectionInfo(); if(self->includedDCs.size()) { //start this actor before any potential recruitments can happen @@ -3471,8 +3472,6 @@ ACTOR Future dataDistributionTeamCollection( self->addActor.send(monitorHealthyTeams( self )); self->addActor.send(waitHealthyZoneChange( self )); - self->traceTeamCollectionInfo(); - // SOMEDAY: Monitor FF/serverList for (new) servers that aren't in allServers and add or remove them loop choose { diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index f5868713f1..2c7d3c80f6 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -323,7 +323,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); - 
wait(delay(10.0)); + wait(delay(5.0)); } else { return true; } From bc4548e0d32754cd57011f67c51e78f9492d63df Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Thu, 27 Jun 2019 17:55:41 -0700 Subject: [PATCH 030/136] Fix sed accidentally rewriting a trace event to have an invalid field name. --- fdbrpc/sim2.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 044fc790db..a7ee2623e9 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1698,7 +1698,7 @@ void startNewSimulator(bool objSerializer) { } ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) { - TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriority::DefaultDelay", TaskPriority::DefaultDelay); + TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriorityDefaultDelay", TaskPriority::DefaultDelay); wait( g_sim2.delay( 0, TaskPriority::DefaultDelay, p ) ); // Switch to the machine in question From f889843332853a83699a804457f09ab9086ea22d Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 18:24:18 -0700 Subject: [PATCH 031/136] Change traceTeamCollectionInfo to actor There are cases where traceTeamCollectionInfo was called within the same execution block, i.e., no wait between the two traceTeamCollectionInfo calls. 
Because simulation uses the same time for all execution instructions in the same execution block, having more than one traceTeamCollectionInfo at the same time will mess up the trackLatest semantics. When one of them is always chosen by simulator, simulation test will report false positive error. Changing this function to actor and adding a small delay inside the function can solve this problem. --- fdbserver/DataDistribution.actor.cpp | 48 ++++++++++++++++++---------- fdbserver/QuietDatabase.actor.cpp | 2 +- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 6316c2cc41..689c4650f2 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -535,6 +535,7 @@ Future storageServerTracker( Version const& addedVersion); Future teamTracker(struct DDTeamCollection* const& self, Reference const& team, bool const& badTeam, bool const& redundantTeam); +ACTOR static Future traceTeamCollectionInfo(DDTeamCollection* self); struct DDTeamCollection : ReferenceCounted { enum { REQUESTING_WORKER = 0, GETTING_WORKER = 1, GETTING_STORAGE = 2 }; @@ -958,7 +959,7 @@ struct DDTeamCollection : ReferenceCounted { } // Trace and record the current number of teams for correctness test - self->traceTeamCollectionInfo(); + wait( self->traceTeamCollectionInfo(self) ); return Void(); } @@ -1873,29 +1874,29 @@ struct DDTeamCollection : ReferenceCounted { } // Check if the number of server (and machine teams) is larger than the maximum allowed number - void traceTeamCollectionInfo() { - int totalHealthyServerCount = calculateHealthyServerCount(); + ACTOR static Future traceTeamCollectionInfo(DDTeamCollection* self) { + int totalHealthyServerCount = self->calculateHealthyServerCount(); int desiredServerTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyServerCount; int maxServerTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyServerCount; - int 
totalHealthyMachineCount = calculateHealthyMachineCount(); + int totalHealthyMachineCount = self->calculateHealthyMachineCount(); int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; - int healthyMachineTeamCount = getHealthyMachineTeamCount(); + int healthyMachineTeamCount = self->getHealthyMachineTeamCount(); - std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); - std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); - TraceEvent("TeamCollectionInfo", distributorId) - .detail("Primary", primary) + TraceEvent("TeamCollectionInfo", self->distributorId) + .detail("Primary", self->primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", 0) .detail("RemainingTeamBudget", 0) - .detail("CurrentTeamNumber", teams.size()) + .detail("CurrentTeamNumber", self->teams.size()) .detail("DesiredTeamNumber", desiredServerTeams) .detail("MaxTeamNumber", maxServerTeams) - .detail("StorageTeamSize", configuration.storageTeamSize) - .detail("CurrentMachineTeamNumber", machineTeams.size()) + .detail("StorageTeamSize", self->configuration.storageTeamSize) + .detail("CurrentMachineTeamNumber", self->machineTeams.size()) .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) @@ -1904,15 +1905,20 @@ struct DDTeamCollection : ReferenceCounted { .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) - .detail("DoBuildTeams", doBuildTeams) + 
.detail("DoBuildTeams", self->doBuildTeams) .trackLatest("TeamCollectionInfo"); + // Advance time so that we will not have multiple TeamCollectionInfo at the same time, otherwise + // simulation test will randomly pick one TeamCollectionInfo trace, which could be the one before build teams + wait( delay(0.01) ); + // Debug purpose // if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) { // // When the number of machine teams is over the limit, print out the current team info. // traceAllInfo(true); // } + return Void(); } // Use the current set of known processes (from server_info) to compute an optimized set of storage server teams. @@ -2474,7 +2480,7 @@ ACTOR Future teamRemover(DDTeamCollection* self) { .detail("CurrentMachineTeamNumber", self->machineTeams.size()) .detail("DesiredMachineTeam", desiredMachineTeams) .detail("NumMachineTeamRemoved", numMachineTeamRemoved); - self->traceTeamCollectionInfo(); + wait( self->traceTeamCollectionInfo(self) ); } } } @@ -3087,6 +3093,7 @@ ACTOR Future storageServerTracker( } lastIsUnhealthy = status.isUnhealthy(); + state bool recordTeamCollectionInfo = false; choose { when( wait( failureTracker ) ) { // The server is failed AND all data has been removed from it, so permanently remove it. 
@@ -3190,7 +3197,8 @@ ACTOR Future storageServerTracker( self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); // The team number changes, so we need to update the team number info - self->traceTeamCollectionInfo(); + // wait( traceTeamCollectionInfo(self) ); + recordTeamCollectionInfo = true; } } @@ -3198,12 +3206,14 @@ ACTOR Future storageServerTracker( // We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality ); + // wait( traceTeamCollectionInfo(self) ); + recordTeamCollectionInfo = true; //Restart the storeTracker for the new interface storeTracker = keyValueStoreTypeTracker(self, server); hasWrongStoreTypeOrDC = false; self->doBuildTeams = true; self->restartTeamBuilder.trigger(); - self->traceTeamCollectionInfo(); + if(restartRecruiting) self->restartRecruiting.trigger(); } @@ -3224,6 +3234,10 @@ ACTOR Future storageServerTracker( server->wakeUpTracker = Promise(); } } + + if ( recordTeamCollectionInfo ) { + wait( self->traceTeamCollectionInfo(self) ); + } } } catch( Error &e ) { if (e.code() != error_code_actor_cancelled && errorOut.canBeSet()) @@ -3458,7 +3472,7 @@ ACTOR Future dataDistributionTeamCollection( self->redundantTeamRemover = teamRemover(self); self->addActor.send(self->redundantTeamRemover); } - self->traceTeamCollectionInfo(); + wait( self->traceTeamCollectionInfo(self) ); if(self->includedDCs.size()) { //start this actor before any potential recruitments can happen diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 2c7d3c80f6..4245495f07 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -323,7 +323,7 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) .detail("DesiredTeamsPerServer", 
SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); - wait(delay(5.0)); + return false; } else { return true; } From ce7eb10cacc1d07875d473e22bde0c09249d0adb Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 19:04:22 -0700 Subject: [PATCH 032/136] TeamCollectionInfo: Only count team number for healthy server and machine --- fdbserver/DataDistribution.actor.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 689c4650f2..5a848b4714 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1610,6 +1610,9 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& server : server_info) { + if ( server_status.get(server.first).isUnhealthy() ) { + continue; + } if (server.second->teams.size() < minTeamNumber) { minTeamNumber = server.second->teams.size(); } @@ -1624,6 +1627,9 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& machine : machine_info) { + if ( !isMachineHealthy(machine.second) ) { + continue; + } if (machine.second->machineTeams.size() < minTeamNumber) { minTeamNumber = machine.second->machineTeams.size(); } From 4da345f7d2e1eeb1d7ebfabac6b4b3c4639cbea0 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 27 Jun 2019 19:05:10 -0700 Subject: [PATCH 033/136] TeamCollectionTest:Remove test on minTeamOnServer --- fdbserver/QuietDatabase.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 4245495f07..886f7ad099 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -308,7 +308,7 @@ ACTOR Future getTeamCollectionValid(Database 
cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0 || minServerTeamOnServer <= 0 ) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || minMachineTeamOnMachine <= 0 ) { TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) From 235697f688a31dc7bcb5da5c67aade1bc9efa6d7 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 27 Jun 2019 23:18:26 -0700 Subject: [PATCH 034/136] fix: txsTags are not popped at the recovery version --- fdbserver/OldTLogServer_6_0.actor.cpp | 2 +- fdbserver/TLogServer.actor.cpp | 2 +- fdbserver/workloads/LocalRatekeeper.actor.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 6e3034821e..8337977ca1 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -401,7 +401,7 @@ struct LogData : NonCopyable, public ReferenceCounted { //only callable after getTagData returns a null reference Reference createTagData(Tag tag, Version popped, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered) { - if(tag.locality != tagLocalityLogRouter && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { + if(tag.locality != tagLocalityLogRouter && tag.locality != tagLocalityTxs && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { popped = recoveredAt + 1; } Reference newTagData = Reference( new TagData(tag, popped, nothingPersistent, poppedRecently, unpoppedRecovered) ); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index e34052a0a4..83a89cb9b8 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -458,7 +458,7 @@ struct LogData : NonCopyable, 
public ReferenceCounted { //only callable after getTagData returns a null reference Reference createTagData(Tag tag, Version popped, bool nothingPersistent, bool poppedRecently, bool unpoppedRecovered) { - if(tag.locality != tagLocalityLogRouter && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { + if(tag.locality != tagLocalityLogRouter && tag.locality != tagLocalityTxs && allTags.size() && !allTags.count(tag) && popped <= recoveredAt) { popped = recoveredAt + 1; } Reference newTagData = Reference( new TagData(tag, popped, 0, nothingPersistent, poppedRecently, unpoppedRecovered) ); diff --git a/fdbserver/workloads/LocalRatekeeper.actor.cpp b/fdbserver/workloads/LocalRatekeeper.actor.cpp index 95c7eea701..53c7f339f0 100644 --- a/fdbserver/workloads/LocalRatekeeper.actor.cpp +++ b/fdbserver/workloads/LocalRatekeeper.actor.cpp @@ -61,7 +61,7 @@ struct LocalRatekeeperWorkload : TestWorkload { state std::vector> requests; requests.reserve(100); loop { - state StorageQueuingMetricsReply metrics = wait(ssi.getQueuingMetrics.getReply(StorageQueuingMetricsRequest{})); + state StorageQueuingMetricsReply metrics = wait(brokenPromiseToNever(ssi.getQueuingMetrics.getReply(StorageQueuingMetricsRequest{}))); auto durabilityLag = metrics.version - metrics.durableVersion; double expectedRateLimit = 1.0; if (durabilityLag >= SERVER_KNOBS->STORAGE_DURABILITY_LAG_HARD_MAX) { From 2113d6d01e1ec5f1f9a2611a864f18c6d2f60251 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 27 Jun 2019 23:39:19 -0700 Subject: [PATCH 035/136] fix: peek all possible txsTags which could have been used by old log sets --- fdbserver/TagPartitionedLogSystem.actor.cpp | 33 ++++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2abd10c8c5..b47f32586d 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -746,9 
+746,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); + for(auto& it : oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + } + if(peekLocality < 0 || localEnd == invalidVersion || localEnd <= begin) { std::vector< Reference > cursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } //SOMEDAY: remove once upgrades from 6.2 are no longer supported @@ -760,7 +765,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted= end) { std::vector< Reference > cursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, end, true, peekLocality)); } //SOMEDAY: remove once upgrades from 6.2 are no longer supported @@ -776,7 +781,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted > localCursors; std::vector< Reference > allCursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { localCursors.push_back(peekLocal(dbgid, Tag(tagLocalityTxs, i), begin, localEnd, true, peekLocality)); allCursors.push_back(peekAll(dbgid, localEnd, end, Tag(tagLocalityTxs, i), true)); } @@ -792,7 +797,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted > cursors; - for(int i = 0; i < tLogs[0]->logServers.size(); i++) { + for(int i = 0; i < maxTxsTags; i++) { cursors.push_back(peekAll(dbgid, begin, end, Tag(tagLocalityTxs, i), true)); } //SOMEDAY: remove once upgrades from 6.2 are no longer supported @@ -1781,7 +1786,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + int maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); + for(auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, 
it.tLogs[0]->logServers.size()); + } + for(int i = -1; i < maxTxsTags; i++) { Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSet->getPushLocations( vector(1, tag), locations, 0 ); @@ -1867,6 +1876,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { + maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); + for(auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + } + } + if(region.satelliteTLogReplicationFactor > 0) { logSystem->tLogs.emplace_back(new LogSet()); if(recr.satelliteFallback) { @@ -1891,7 +1908,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->logServers.resize( recr.satelliteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); - logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),oldLogSystem->tLogs.size() ? oldLogSystem->tLogs[0]->logServers.size() : 0); + logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),maxTxsTags); logSystem->expectedLogSets++; } @@ -1972,7 +1989,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); } if(oldLogSystem->tLogs.size()) { - for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + for(int i = -1; i < maxTxsTags; i++) { Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); @@ -2022,7 +2039,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - for(int i = -1; i < oldLogSystem->tLogs[0]->logServers.size(); i++) { + for(int i = -1; i < maxTxsTags; i++) { Tag tag = i==-1 ? 
txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); From cb681693df715cc757cdf02e6792baae082f1628 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Fri, 28 Jun 2019 09:50:40 -0700 Subject: [PATCH 036/136] TeamCollection:Do NOT consider healthyness in counting team number If a team is removed from DD, it will be marked as failed and eventually removed from the global teams data structure. Team healthyness is likely to be a temporary state which can be changed rather quickly. --- fdbserver/DataDistribution.actor.cpp | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 5a848b4714..11ce0063ff 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -1510,12 +1510,7 @@ struct DDTeamCollection : ReferenceCounted { // Only pick healthy server, which is not failed or excluded. 
if (server_status.get(server.first).isUnhealthy()) continue; - int numTeams = 0; - for (auto& t : server.second->teams) { - if (!t->isWrongConfiguration() && t->isHealthy()) { - ++numTeams; - } - } + int numTeams = server.second->teams.size(); if (numTeams < minTeamNumber) { minTeamNumber = numTeams; leastUsedServers.clear(); @@ -1610,9 +1605,9 @@ struct DDTeamCollection : ReferenceCounted { uint32_t minTeamNumber = std::numeric_limits::max(); uint32_t maxTeamNumber = std::numeric_limits::min(); for (auto& server : server_info) { - if ( server_status.get(server.first).isUnhealthy() ) { - continue; - } + // if ( server_status.get(server.first).isUnhealthy() ) { + // continue; + // } if (server.second->teams.size() < minTeamNumber) { minTeamNumber = server.second->teams.size(); } @@ -1623,13 +1618,13 @@ struct DDTeamCollection : ReferenceCounted { return std::make_pair(minTeamNumber, maxTeamNumber); } - std::pair calculateMinMaxMachineTeamNumOnMachine() { - uint32_t minTeamNumber = std::numeric_limits::max(); - uint32_t maxTeamNumber = std::numeric_limits::min(); + std::pair calculateMinMaxMachineTeamNumOnMachine() { + int minTeamNumber = std::numeric_limits::max(); + int maxTeamNumber = 0; for (auto& machine : machine_info) { - if ( !isMachineHealthy(machine.second) ) { - continue; - } + // if ( !isMachineHealthy(machine.second) ) { + // continue; + // } if (machine.second->machineTeams.size() < minTeamNumber) { minTeamNumber = machine.second->machineTeams.size(); } From 7f4586ad497105f9f67b01bdb9de89475ffa889d Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 28 Jun 2019 12:33:24 -0700 Subject: [PATCH 037/136] the number of txsTags needs to be tracked separately from the number of transaction logs because of forced recoveries --- fdbserver/DBCoreState.h | 14 ++- fdbserver/LogSystemConfig.h | 14 ++- fdbserver/TagPartitionedLogSystem.actor.cpp | 115 +++++++++++++------- flow/ProtocolVersion.h | 1 + 4 files changed, 90 insertions(+), 54 deletions(-) diff --git 
a/fdbserver/DBCoreState.h b/fdbserver/DBCoreState.h index cef22c0b60..b5006d2e72 100644 --- a/fdbserver/DBCoreState.h +++ b/fdbserver/DBCoreState.h @@ -76,14 +76,15 @@ struct CoreTLogSet { struct OldTLogCoreData { std::vector tLogs; int32_t logRouterTags; + int32_t txsTags; Version epochEnd; std::set pseudoLocalities; - OldTLogCoreData() : epochEnd(0), logRouterTags(0) {} + OldTLogCoreData() : epochEnd(0), logRouterTags(0), txsTags(0) {} explicit OldTLogCoreData(const OldLogData&); bool operator == (OldTLogCoreData const& rhs) const { - return tLogs == rhs.tLogs && logRouterTags == rhs.logRouterTags && epochEnd == rhs.epochEnd && pseudoLocalities == rhs.pseudoLocalities; + return tLogs == rhs.tLogs && logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && epochEnd == rhs.epochEnd && pseudoLocalities == rhs.pseudoLocalities; } template @@ -97,7 +98,7 @@ struct OldTLogCoreData { tLogs[0].tLogVersion = TLogVersion::V2; } if (ar.protocolVersion().hasPseudoLocalities()) { - serializer(ar, pseudoLocalities); + serializer(ar, pseudoLocalities, txsTags); } } }; @@ -105,12 +106,13 @@ struct OldTLogCoreData { struct DBCoreState { std::vector tLogs; int32_t logRouterTags; + int32_t txsTags; std::vector oldTLogData; DBRecoveryCount recoveryCount; // Increases with sequential successful recoveries. 
LogSystemType logSystemType; std::set pseudoLocalities; - DBCoreState() : logRouterTags(0), recoveryCount(0), logSystemType(LogSystemType::empty) {} + DBCoreState() : logRouterTags(0), txsTags(0), recoveryCount(0), logSystemType(LogSystemType::empty) {} vector getPriorCommittedLogServers() { vector priorCommittedLogServers; @@ -130,7 +132,7 @@ struct DBCoreState { } bool isEqual(DBCoreState const& r) const { - return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData && logRouterTags == r.logRouterTags && pseudoLocalities == r.pseudoLocalities; + return logSystemType == r.logSystemType && recoveryCount == r.recoveryCount && tLogs == r.tLogs && oldTLogData == r.oldTLogData && logRouterTags == r.logRouterTags && txsTags == r.txsTags && pseudoLocalities == r.pseudoLocalities; } bool operator == ( const DBCoreState& rhs ) const { return isEqual(rhs); } @@ -146,7 +148,7 @@ struct DBCoreState { if(ar.protocolVersion().hasTagLocality()) { serializer(ar, tLogs, logRouterTags, oldTLogData, recoveryCount, logSystemType); if (ar.protocolVersion().hasPseudoLocalities()) { - serializer(ar, pseudoLocalities); + serializer(ar, pseudoLocalities, txsTags); } } else if(ar.isDeserializing) { tLogs.push_back(CoreTLogSet()); diff --git a/fdbserver/LogSystemConfig.h b/fdbserver/LogSystemConfig.h index b1947d1457..c7b0a592d2 100644 --- a/fdbserver/LogSystemConfig.h +++ b/fdbserver/LogSystemConfig.h @@ -157,9 +157,10 @@ struct OldTLogConf { std::vector tLogs; Version epochEnd; int32_t logRouterTags; + int32_t txsTags; std::set pseudoLocalities; - OldTLogConf() : epochEnd(0), logRouterTags(0) {} + OldTLogConf() : epochEnd(0), logRouterTags(0), txsTags(0) {} explicit OldTLogConf(const OldLogData&); std::string toString() const { @@ -167,7 +168,7 @@ struct OldTLogConf { } bool operator == ( const OldTLogConf& rhs ) const { - return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd && logRouterTags == rhs.logRouterTags && 
pseudoLocalities == rhs.pseudoLocalities; + return tLogs == rhs.tLogs && epochEnd == rhs.epochEnd && logRouterTags == rhs.logRouterTags && txsTags == rhs.txsTags && pseudoLocalities == rhs.pseudoLocalities; } bool isEqualIds(OldTLogConf const& r) const { @@ -184,7 +185,7 @@ struct OldTLogConf { template void serialize( Ar& ar ) { - serializer(ar, tLogs, epochEnd, logRouterTags, pseudoLocalities); + serializer(ar, tLogs, epochEnd, logRouterTags, pseudoLocalities, txsTags); } }; @@ -199,6 +200,7 @@ struct LogSystemConfig { LogSystemType logSystemType; std::vector tLogs; int32_t logRouterTags; + int32_t txsTags; std::vector oldTLogs; int32_t expectedLogSets; UID recruitmentID; @@ -206,7 +208,7 @@ struct LogSystemConfig { Optional recoveredAt; std::set pseudoLocalities; - LogSystemConfig() : logSystemType(LogSystemType::empty), logRouterTags(0), expectedLogSets(0), stopped(false) {} + LogSystemConfig() : logSystemType(LogSystemType::empty), logRouterTags(0), txsTags(0), expectedLogSets(0), stopped(false) {} std::string toString() const { return format("type: %d oldGenerations: %d tags: %d %s", logSystemType, oldTLogs.size(), logRouterTags, describe(tLogs).c_str()); @@ -327,7 +329,7 @@ struct LogSystemConfig { bool operator == ( const LogSystemConfig& rhs ) const { return isEqual(rhs); } bool isEqual(LogSystemConfig const& r) const { - return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && pseudoLocalities == r.pseudoLocalities; + return logSystemType == r.logSystemType && tLogs == r.tLogs && oldTLogs == r.oldTLogs && expectedLogSets == r.expectedLogSets && logRouterTags == r.logRouterTags && txsTags == r.txsTags && recruitmentID == r.recruitmentID && stopped == r.stopped && recoveredAt == r.recoveredAt && pseudoLocalities == r.pseudoLocalities; } bool 
isEqualIds(LogSystemConfig const& r) const { @@ -358,7 +360,7 @@ struct LogSystemConfig { template void serialize( Ar& ar ) { - serializer(ar, logSystemType, tLogs, logRouterTags, oldTLogs, expectedLogSets, recruitmentID, stopped, recoveredAt, pseudoLocalities); + serializer(ar, logSystemType, tLogs, logRouterTags, oldTLogs, expectedLogSets, recruitmentID, stopped, recoveredAt, pseudoLocalities, txsTags); } }; diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index b47f32586d..ede5d4bb7b 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -44,15 +44,16 @@ ACTOR Future minVersionWhenReady( Future f, std::vector> tLogs; int32_t logRouterTags; + int32_t txsTags; Version epochEnd; std::set pseudoLocalities; - OldLogData() : epochEnd(0), logRouterTags(0) {} + OldLogData() : epochEnd(0), logRouterTags(0), txsTags(0) {} // Constructor for T of OldTLogConf and OldTLogCoreData template explicit OldLogData(const T& conf) - : logRouterTags(conf.logRouterTags), epochEnd(conf.epochEnd), + : logRouterTags(conf.logRouterTags), txsTags(conf.txsTags), epochEnd(conf.epochEnd), pseudoLocalities(conf.pseudoLocalities) { tLogs.resize(conf.tLogs.size()); @@ -123,7 +124,7 @@ TLogSet::TLogSet(const LogSet& rhs) : } OldTLogConf::OldTLogConf(const OldLogData& oldLogData) : - logRouterTags(oldLogData.logRouterTags), epochEnd(oldLogData.epochEnd), + logRouterTags(oldLogData.logRouterTags), txsTags(oldLogData.txsTags), epochEnd(oldLogData.epochEnd), pseudoLocalities(oldLogData.pseudoLocalities) { for (const Reference& logSet : oldLogData.tLogs) { @@ -146,7 +147,7 @@ CoreTLogSet::CoreTLogSet(const LogSet& logset) : } OldTLogCoreData::OldTLogCoreData(const OldLogData& oldData) : - logRouterTags(oldData.logRouterTags), epochEnd(oldData.epochEnd), + logRouterTags(oldData.logRouterTags), txsTags(oldData.txsTags), epochEnd(oldData.epochEnd), pseudoLocalities(oldData.pseudoLocalities) { for 
(const Reference& logSet : oldData.tLogs) { @@ -162,6 +163,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> tLogs; int expectedLogSets; int logRouterTags; + int txsTags; UID recruitmentID; int repopulateRegionAntiQuorum; bool stopped; @@ -188,7 +190,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted oldLogData; AsyncTrigger logSystemConfigChanged; - TagPartitionedLogSystem( UID dbgid, LocalityData locality, Optional>> addActor = Optional>>() ) : dbgid(dbgid), locality(locality), addActor(addActor), popActors(false), recoveryCompleteWrittenToCoreState(false), remoteLogsWrittenToCoreState(false), logSystemType(LogSystemType::empty), logRouterTags(0), expectedLogSets(0), hasRemoteServers(false), stopped(false), repopulateRegionAntiQuorum(0) {} + TagPartitionedLogSystem( UID dbgid, LocalityData locality, Optional>> addActor = Optional>>() ) : dbgid(dbgid), locality(locality), addActor(addActor), popActors(false), recoveryCompleteWrittenToCoreState(false), remoteLogsWrittenToCoreState(false), logSystemType(LogSystemType::empty), logRouterTags(0), txsTags(0), expectedLogSets(0), hasRemoteServers(false), stopped(false), repopulateRegionAntiQuorum(0) {} virtual void stopRejoins() { rejoins = Future(); @@ -264,6 +266,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.reserve(lsConf.tLogs.size()); logSystem->expectedLogSets = lsConf.expectedLogSets; logSystem->logRouterTags = lsConf.logRouterTags; + logSystem->txsTags = lsConf.txsTags; logSystem->recruitmentID = lsConf.recruitmentID; logSystem->stopped = lsConf.stopped; if(useRecoveredAt) { @@ -294,6 +297,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.emplace_back(new LogSet(tLogSet)); } logSystem->logRouterTags = lsConf.oldTLogs[0].logRouterTags; + logSystem->txsTags = lsConf.oldTLogs[0].txsTags; //logSystem->epochEnd = lsConf.oldTLogs[0].epochEnd; for (int i = 1; i < lsConf.oldTLogs.size(); i++ ) { @@ -316,6 +320,7 @@ struct 
TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size()) { @@ -746,10 +751,13 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); + int maxTxsTags = txsTags; + bool needsOldTxs = txsTags==0; for(auto& it : oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.txsTags==0; } + if(peekLocality < 0 || localEnd == invalidVersion || localEnd <= begin) { std::vector< Reference > cursors; @@ -757,7 +765,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } @@ -769,7 +779,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } @@ -786,8 +798,10 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(localCursors, begin, localEnd, false) ); cursors[0] = Reference( new ILogSystem::BufferedCursor(allCursors, localEnd, end, false) ); @@ -801,7 +815,9 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::BufferedCursor(cursors, begin, end, false) ); } @@ -952,13 +968,13 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogServers.size(); i++) { + if(txsTags == 0) { + pop(upTo, txsTag, 0, popLocality); + } else { + for(int i = 0; i < txsTags; i++) { pop(upTo, Tag(tagLocalityTxs, i), 0, popLocality); } } - //SOMEDAY: remove once upgrades from 6.2 are no longer supported - pop(upTo, txsTag, 0, popLocality); } virtual void pop( Version upTo, Tag tag, Version durableKnownCommittedVersion, int8_t popLocality ) { @@ -1086,6 +1102,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedrandomInt(0, tLogs[0]->logServers.size())); + if(txsTags==0) { + return txsTag; + } + return Tag(tagLocalityTxs, deterministicRandom()->randomInt(0, txsTags)); } ACTOR static Future monitorLog(Reference>> 
logServer, Reference> failed) { @@ -1529,6 +1548,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs = logServers; logSystem->logRouterTags = prevState.logRouterTags; + logSystem->txsTags = prevState.txsTags; logSystem->oldLogData = oldLogData; logSystem->logSystemType = prevState.logSystemType; logSystem->rejoins = rejoins; @@ -1786,21 +1806,26 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - int maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); + int maxTxsTags = oldLogSystem->txsTags; + bool needsOldTxs = oldLogSystem->txsTags==0; for(auto& it : oldLogSystem->oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.txsTags==0; } - for(int i = -1; i < maxTxsTags; i++) { + for(int i = needsOldTxs?-1:0; i < maxTxsTags; i++) { Tag tag = i==-1 ? txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSet->getPushLocations( vector(1, tag), locations, 0 ); for(int loc : locations) remoteTLogReqs[ loc ].recoverTags.push_back( tag ); } - for(int i = 0; i < self->tLogs[0]->logServers.size(); i++) { - localTags.push_back(Tag(tagLocalityTxs, i)); + if(self->txsTags == 0) { + localTags.push_back(txsTag); + } else { + for(int i = 0; i < self->txsTags; i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } } - localTags.push_back(txsTag); } } @@ -1820,7 +1845,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedstartVersion; req.logRouterTags = 0; - req.txsTags = self->tLogs[0]->logServers.size(); + req.txsTags = self->txsTags; } for( int i = 0; i < remoteWorkers.remoteTLogs.size(); i++ ) @@ -1858,6 +1883,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedrecoveredAt = oldLogSystem->recoverAt; logSystem->repopulateRegionAntiQuorum = configuration.repopulateRegionAntiQuorum; logSystem->recruitmentID = deterministicRandom()->randomUniqueID(); + logSystem->txsTags = recr.tLogs.size(); 
oldLogSystem->recruitmentID = logSystem->recruitmentID; if(configuration.usableRegions > 1) { @@ -1876,12 +1902,11 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - maxTxsTags = oldLogSystem->tLogs[0]->logServers.size(); - for(auto& it : oldLogSystem->oldLogData) { - maxTxsTags = std::max(maxTxsTags, it.tLogs[0]->logServers.size()); - } + state int maxTxsTags = oldLogSystem->txsTags; + state bool needsOldTxs = oldLogSystem->txsTags==0; + for(auto& it : oldLogSystem->oldLogData) { + maxTxsTags = std::max(maxTxsTags, it.txsTags); + needsOldTxs = needsOldTxs || it.txsTags==0; } if(region.satelliteTLogReplicationFactor > 0) { @@ -1908,7 +1933,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[1]->logServers.resize( recr.satelliteTLogs.size() ); // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size logSystem->tLogs[1]->updateLocalitySet(logSystem->tLogs[1]->tLogLocalities); - logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,recr.tLogs.size(),maxTxsTags); + logSystem->tLogs[1]->populateSatelliteTagLocations(logSystem->logRouterTags,oldLogSystem->logRouterTags,logSystem->txsTags,maxTxsTags); logSystem->expectedLogSets++; } @@ -1989,17 +2014,20 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->bestLocationFor( tag ) ].recoverTags.push_back( tag ); } if(oldLogSystem->tLogs.size()) { - for(int i = -1; i < maxTxsTags; i++) { + for(int i = needsOldTxs?-1:0; i < maxTxsTags; i++) { Tag tag = i==-1 ? 
txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); for(int loc : locations) reqs[ loc ].recoverTags.push_back( tag ); } - for(int i = 0; i < recr.tLogs.size(); i++) { - localTags.push_back(Tag(tagLocalityTxs, i)); + if(logSystem->txsTags == 0) { + localTags.push_back(txsTag); + } else { + for(int i = 0; i < logSystem->txsTags; i++) { + localTags.push_back(Tag(tagLocalityTxs, i)); + } } - localTags.push_back(txsTag); } for( int i = 0; i < recr.tLogs.size(); i++ ) { @@ -2018,7 +2046,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs[0]->startVersion; req.logRouterTags = logSystem->logRouterTags; - req.txsTags = recr.tLogs.size(); + req.txsTags = logSystem->txsTags; } for( int i = 0; i < recr.tLogs.size(); i++ ) @@ -2039,17 +2067,20 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtLogs.size()) { - for(int i = -1; i < maxTxsTags; i++) { + for(int i = needsOldTxs?-1:0; i < maxTxsTags; i++) { Tag tag = i==-1 ? 
txsTag : Tag(tagLocalityTxs, i); locations.clear(); logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); for(int loc : locations) sreqs[ loc ].recoverTags.push_back( tag ); } - for(int i = 0; i < recr.tLogs.size(); i++) { - satelliteTags.push_back(Tag(tagLocalityTxs, i)); + if(logSystem->txsTags == 0) { + satelliteTags.push_back(txsTag); + } else { + for(int i = 0; i < logSystem->txsTags; i++) { + satelliteTags.push_back(Tag(tagLocalityTxs, i)); + } } - satelliteTags.push_back(txsTag); } for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) { @@ -2068,7 +2099,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedknownCommittedVersion + 1; req.logRouterTags = logSystem->logRouterTags; - req.txsTags = recr.tLogs.size(); + req.txsTags = logSystem->txsTags; } for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) diff --git a/flow/ProtocolVersion.h b/flow/ProtocolVersion.h index f236a4fabf..e35b7f197b 100644 --- a/flow/ProtocolVersion.h +++ b/flow/ProtocolVersion.h @@ -86,6 +86,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, IPv6); PROTOCOL_VERSION_FEATURE(0x0FDB00B061030000LL, TLogVersion); PROTOCOL_VERSION_FEATURE(0x0FDB00B061070000LL, PseudoLocalities); + PROTOCOL_VERSION_FEATURE(0x0FDB00B061070000LL, ShardedTxsTags); }; // These impact both communications and the deserialization of certain database and IKeyValueStore keys. From 2035b362573107f44ab57f837eb25a6cb938f4c7 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 28 Jun 2019 13:24:32 -0700 Subject: [PATCH 038/136] Make default and persistent options specifyable via annotations to fdb.options. Fix some issues with persisting these options in the multi-version client. Make size limit option not persistent. 
--- bindings/go/src/fdb/generated.go | 28 ++++- bindings/python/tests/size_limit.py | 66 ++++++----- bindings/python/tests/tester.py | 4 + .../sphinx/source/api-common.rst.inc | 2 +- fdbclient/DatabaseContext.h | 6 +- fdbclient/FDBOptions.h | 37 +++++- fdbclient/MultiVersionTransaction.actor.cpp | 47 +++++++- fdbclient/MultiVersionTransaction.h | 8 +- fdbclient/NativeAPI.actor.cpp | 112 ++++++++---------- fdbclient/NativeAPI.actor.h | 2 + fdbclient/ReadYourWrites.actor.cpp | 65 +++++----- fdbclient/ReadYourWrites.h | 5 +- fdbclient/ThreadSafeTransaction.actor.cpp | 4 + fdbclient/vexillographer/cpp.cs | 4 +- fdbclient/vexillographer/fdb.options | 23 ++-- fdbclient/vexillographer/vexillographer.cs | 13 +- 16 files changed, 276 insertions(+), 150 deletions(-) diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index 2a7c40f6aa..782b108fda 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -228,6 +228,30 @@ func (o NetworkOptions) SetEnableSlowTaskProfiling() error { return o.setOpt(71, nil) } +// Enable client buggify - will make requests randomly fail (intended for client testing) +func (o NetworkOptions) SetClientBuggifyEnable() error { + return o.setOpt(80, nil) +} + +// Disable client buggify +func (o NetworkOptions) SetClientBuggifyDisable() error { + return o.setOpt(81, nil) +} + +// Set the probability of a CLIENT_BUGGIFY section being active for the current execution. +// +// Parameter: probability expressed as a percentage between 0 and 100 +func (o NetworkOptions) SetClientBuggifySectionActivatedProbability(param int64) error { + return o.setOpt(82, int64ToBytes(param)) +} + +// Set the probability of an active CLIENT_BUGGIFY section being fired. 
A section will only fire if it was activated +// +// Parameter: probability expressed as a percentage between 0 and 100 +func (o NetworkOptions) SetClientBuggifySectionFiredProbability(param int64) error { + return o.setOpt(83, int64ToBytes(param)) +} + // Set the size of the client location cache. Raising this value can boost performance in very large databases where clients access data in a near-random pattern. Defaults to 100000. // // Parameter: Max location cache entries @@ -277,7 +301,7 @@ func (o DatabaseOptions) SetTransactionMaxRetryDelay(param int64) error { return o.setOpt(502, int64ToBytes(param)) } -// Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Default to 10,000,000 bytes. +// Set the maximum transaction size in bytes. This sets the ``size_limit`` option on each transaction created by this database. See the transaction option description for more information. // // Parameter: value in bytes func (o DatabaseOptions) SetTransactionSizeLimit(param int64) error { @@ -409,7 +433,7 @@ func (o TransactionOptions) SetMaxRetryDelay(param int64) error { return o.setOpt(502, int64ToBytes(param)) } -// Set the maximum transaction size which, if exceeded, will cause the transaction to be cancelled. Valid parameter values are ``[32, 10,000,000]```. +// Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit. 
// // Parameter: value in bytes func (o TransactionOptions) SetSizeLimit(param int64) error { diff --git a/bindings/python/tests/size_limit.py b/bindings/python/tests/size_limit.py index 6d08f15efc..3072e153f8 100644 --- a/bindings/python/tests/size_limit.py +++ b/bindings/python/tests/size_limit.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -# size_limit.py +# size_limit_tests.py # # This source file is part of the FoundationDB open source project # @@ -21,44 +21,56 @@ import fdb import sys -fdb.api_version(610) +if __name__ == '__main__': + fdb.api_version(610) @fdb.transactional def setValue(tr, key, value): - tr[key] = value + tr[key] = value @fdb.transactional def setValueWithLimit(tr, key, value, limit): - tr.options.set_size_limit(limit) - tr[key] = value + tr.options.set_size_limit(limit) + tr[key] = value -def run(clusterFile): - db = fdb.open(clusterFile) - db.options.set_transaction_timeout(2000) # 2 seconds - db.options.set_transaction_retry_limit(3) - value = 'a' * 1024 +def test_size_limit_option(db): + db.options.set_transaction_timeout(2000) # 2 seconds + db.options.set_transaction_retry_limit(3) + value = 'a' * 1024 - setValue(db, 't1', value) - assert(value == db['t1']) + setValue(db, 't1', value) + assert(value == db['t1']) - try: - db.options.set_transaction_size_limit(1000) - setValue(db, 't2', value) - assert(False) # not reached - except fdb.impl.FDBError as e: - assert(e.code == 2101) # Transaction exceeds byte limit (2101) + try: + db.options.set_transaction_size_limit(1000) + setValue(db, 't2', value) + assert(False) # not reached + except fdb.FDBError as e: + assert(e.code == 2101) # Transaction exceeds byte limit (2101) - # Per transaction option overrides database option - db.options.set_transaction_size_limit(1000000) - try: - setValueWithLimit(db, 't3', value, 1000) - assert(False) # not reached - except fdb.impl.FDBError as e: - assert(e.code == 2101) # Transaction exceeds byte limit (2101) + # Per transaction option overrides database 
option + db.options.set_transaction_size_limit(1000000) + try: + setValueWithLimit(db, 't3', value, 1000) + assert(False) # not reached + except fdb.FDBError as e: + assert(e.code == 2101) # Transaction exceeds byte limit (2101) + # DB default survives on_error reset + db.options.set_transaction_size_limit(1000) + tr = db.create_transaction() + try: + tr['t4'] = 'bar' + tr.on_error(fdb.FDBError(1007)).wait() + setValue(tr, 't4', value) + tr.commit().wait() + assert(False) # not reached + except fdb.FDBError as e: + assert(e.code == 2101) # Transaction exceeds byte limit (2101) # Expect a cluster file as input. This test will write to the FDB cluster, so # be aware of potential side effects. if __name__ == '__main__': - clusterFile = sys.argv[1] - run(clusterFile) \ No newline at end of file + clusterFile = sys.argv[1] + db = fdb.open(clusterFile) + test_size_limit_option(db) diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 3023cc5cb8..95aa36ea3e 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -48,6 +48,8 @@ from cancellation_timeout_tests import test_retry_limits from cancellation_timeout_tests import test_db_retry_limits from cancellation_timeout_tests import test_combinations +from size_limit_tests import test_size_limit_option + random.seed(0) if len(sys.argv) == 4: @@ -557,6 +559,8 @@ class Tester: test_locality(db) test_predicates() + test_size_limit_option(db) + except fdb.FDBError as e: print("Unit tests failed: %s" % e.description) traceback.print_exc() diff --git a/documentation/sphinx/source/api-common.rst.inc b/documentation/sphinx/source/api-common.rst.inc index 292cd36ec5..3c99c45382 100644 --- a/documentation/sphinx/source/api-common.rst.inc +++ b/documentation/sphinx/source/api-common.rst.inc @@ -399,7 +399,7 @@ .. |option-set-size-limit-blurb| replace:: - Set the maximum transaction size limit in bytes. 
The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit. The value set by this limit will persist across transaction resets. + Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit. .. |option-set-timeout-blurb1| replace:: diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index eca185e8f8..537d329fce 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -154,10 +154,6 @@ public: int outstandingWatches; int maxOutstandingWatches; - double transactionTimeout; - int transactionMaxRetries; - double transactionMaxBackoff; - int transactionMaxSize; // Max size in bytes. 
int snapshotRywEnabled; Future logger; @@ -180,6 +176,8 @@ public: HealthMetrics healthMetrics; double healthMetricsLastUpdated; double detailedHealthMetricsLastUpdated; + + UniqueOrderedOptionList transactionDefaults; }; #endif diff --git a/fdbclient/FDBOptions.h b/fdbclient/FDBOptions.h index e23cab582a..dc3a1d0075 100644 --- a/fdbclient/FDBOptions.h +++ b/fdbclient/FDBOptions.h @@ -23,8 +23,11 @@ #define FDBCLIENT_FDBOPTIONS_H #include +#include #include +#include "flow/Arena.h" + struct FDBOptionInfo { std::string name; std::string comment; @@ -32,9 +35,14 @@ struct FDBOptionInfo { bool hasParameter; bool hidden; + bool persistent; - FDBOptionInfo(std::string name, std::string comment, std::string parameterComment, bool hasParameter, bool hidden) - : name(name), comment(comment), parameterComment(parameterComment), hasParameter(hasParameter), hidden(hidden) { } + // If non-negative, this specifies the code for the transaction option that this option is the default value for. + int defaultFor; + + FDBOptionInfo(std::string name, std::string comment, std::string parameterComment, bool hasParameter, bool hidden, bool persistent, int defaultFor) + : name(name), comment(comment), parameterComment(parameterComment), hasParameter(hasParameter), hidden(hidden), persistent(persistent), + defaultFor(defaultFor) { } FDBOptionInfo() { } }; @@ -54,6 +62,29 @@ public: FDBOptionInfoMap() { T::init(); } }; -#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden ) type::optionInfo[var] = FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden); +template +class UniqueOrderedOptionList { +public: + typedef std::list>>> OptionList; + +private: + OptionList options; + std::map optionsIndexMap; + +public: + void addOption(typename T::Option option, Optional> value) { + auto itr = optionsIndexMap.find(option); + if(itr != optionsIndexMap.end()) { + options.erase(itr->second); + } + options.push_back(std::make_pair(option, value)); + 
optionsIndexMap[option] = --options.end(); + } + + typename OptionList::const_iterator begin() const { return options.cbegin(); } + typename OptionList::const_iterator end() const { return options.cend(); } +}; + +#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor ) type::optionInfo[var] = FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor); #endif \ No newline at end of file diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index c2ef5ad1b6..1f51dd3680 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -408,17 +408,36 @@ void DLApi::addNetworkThreadCompletionHook(void (*hook)(void*), void *hookParame } // MultiVersionTransaction -MultiVersionTransaction::MultiVersionTransaction(Reference db) : db(db) { +MultiVersionTransaction::MultiVersionTransaction(Reference db, UniqueOrderedOptionList defaultOptions) : db(db) { + setDefaultOptions(defaultOptions); updateTransaction(); } -// SOMEDAY: This function is unsafe if it's possible to set Database options that affect subsequently created transactions. There are currently no such options. 
+void MultiVersionTransaction::setDefaultOptions(UniqueOrderedOptionList options) { + MutexHolder holder(db->dbState->optionLock); + std::copy(options.begin(), options.end(), std::back_inserter(persistentOptions)); +} + void MultiVersionTransaction::updateTransaction() { auto currentDb = db->dbState->dbVar->get(); TransactionInfo newTr; if(currentDb.value) { newTr.transaction = currentDb.value->createTransaction(); + + Optional timeout; + for (auto option : persistentOptions) { + if(option.first == FDBTransactionOptions::TIMEOUT) { + timeout = option.second.castTo(); + } + else { + newTr.transaction->setOption(option.first, option.second.castTo()); + } + } + + if(timeout.present()) { + newTr.transaction->setOption(FDBTransactionOptions::TIMEOUT, timeout); + } } newTr.onChange = currentDb.onChange; @@ -574,6 +593,9 @@ Version MultiVersionTransaction::getCommittedVersion() { } void MultiVersionTransaction::setOption(FDBTransactionOptions::Option option, Optional value) { + if(MultiVersionApi::apiVersionAtLeast(610) && FDBTransactionOptions::optionInfo[option].persistent) { + persistentOptions.push_back(std::make_pair(option, value.castTo>())); + } auto tr = getTransaction(); if(tr.transaction) { tr.transaction->setOption(option, value); @@ -593,6 +615,8 @@ ThreadFuture MultiVersionTransaction::onError(Error const& e) { } void MultiVersionTransaction::reset() { + persistentOptions.clear(); + setDefaultOptions(db->dbState->transactionDefaultOptions); updateTransaction(); } @@ -630,13 +654,12 @@ Reference MultiVersionDatabase::debugCreateFromExistingDatabase(Refer } Reference MultiVersionDatabase::createTransaction() { - return Reference(new MultiVersionTransaction(Reference::addRef(this))); + return Reference(new MultiVersionTransaction(Reference::addRef(this), dbState->transactionDefaultOptions)); } void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional value) { MutexHolder holder(dbState->optionLock); - auto itr = 
FDBDatabaseOptions::optionInfo.find(option); if(itr != FDBDatabaseOptions::optionInfo.end()) { TraceEvent("SetDatabaseOption").detail("Option", itr->second.name); @@ -646,11 +669,18 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional throw invalid_option(); } - if(dbState->db) { - dbState->db->setOption(option, value); + int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + if (defaultFor >= 0) { + ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != + FDBTransactionOptions::optionInfo.end()); + dbState->transactionDefaultOptions.addOption((FDBTransactionOptions::Option)defaultFor, value.castTo>()); } dbState->options.push_back(std::make_pair(option, value.castTo>())); + + if(dbState->db) { + dbState->db->setOption(option, value); + } } void MultiVersionDatabase::Connector::connect() { @@ -811,6 +841,11 @@ void MultiVersionDatabase::DatabaseState::cancelConnections() { // MultiVersionApi +bool MultiVersionApi::apiVersionAtLeast(int minVersion) { + ASSERT(MultiVersionApi::api->apiVersion != 0); + return MultiVersionApi::api->apiVersion >= minVersion; +} + // runOnFailedClients should be used cautiously. Some failed clients may not have successfully loaded all symbols. 
void MultiVersionApi::runOnExternalClients(std::function)> func, bool runOnFailedClients) { bool newFailure = false; diff --git a/fdbclient/MultiVersionTransaction.h b/fdbclient/MultiVersionTransaction.h index 6ddaeac8fa..b1a1c3372a 100644 --- a/fdbclient/MultiVersionTransaction.h +++ b/fdbclient/MultiVersionTransaction.h @@ -210,7 +210,7 @@ class MultiVersionDatabase; class MultiVersionTransaction : public ITransaction, ThreadSafeReferenceCounted { public: - MultiVersionTransaction(Reference db); + MultiVersionTransaction(Reference db, UniqueOrderedOptionList defaultOptions); void cancel(); void setVersion(Version v); @@ -261,6 +261,9 @@ private: TransactionInfo getTransaction(); void updateTransaction(); + void setDefaultOptions(UniqueOrderedOptionList options); + + std::vector>>> persistentOptions; }; struct ClientInfo : ThreadSafeReferenceCounted { @@ -341,6 +344,7 @@ private: std::vector> connectionAttempts; std::vector>>> options; + UniqueOrderedOptionList transactionDefaultOptions; Mutex optionLock; }; @@ -370,6 +374,8 @@ public: bool callbackOnMainThread; bool localClientDisabled; + static bool apiVersionAtLeast(int minVersion); + private: MultiVersionApi(); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 6cea540491..4f0770f680 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -279,7 +279,6 @@ struct TrInfoChunk { ACTOR static Future transactionInfoCommitActor(Transaction *tr, std::vector *chunks) { state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); - state int retryCount = 0; loop{ try { tr->reset(); @@ -296,9 +295,6 @@ ACTOR static Future transactionInfoCommitActor(Transaction *tr, std::vecto return Void(); } catch (Error& e) { - retryCount++; - if (retryCount == 10) - throw; wait(tr->onError(e)); } } @@ -516,15 +512,13 @@ DatabaseContext::DatabaseContext( lockAware(lockAware), apiVersion(apiVersion), provisional(false), 
transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0), - transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1), + transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0) { metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; - transactionMaxBackoff = CLIENT_KNOBS->FAILURE_MAX_DELAY; - transactionMaxSize = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; snapshotRywEnabled = apiVersionAtLeast(300) ? 1 : 0; logger = databaseLogger( this ); @@ -745,52 +739,43 @@ uint64_t extractHexOption( StringRef value ) { } void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional value) { - switch(option) { - case FDBDatabaseOptions::LOCATION_CACHE_SIZE: - locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits::max()); - break; - case FDBDatabaseOptions::MACHINE_ID: - clientLocality = LocalityData( clientLocality.processId(), value.present() ? 
Standalone(value.get()) : Optional>(), clientLocality.machineId(), clientLocality.dcId() ); - if( clientInfo->get().proxies.size() ) - masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality ) ); - server_interf.clear(); - locationCache.insert( allKeys, Reference() ); - break; - case FDBDatabaseOptions::MAX_WATCHES: - maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES); - break; - case FDBDatabaseOptions::DATACENTER_ID: - clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? Standalone(value.get()) : Optional>()); - if( clientInfo->get().proxies.size() ) - masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality )); - server_interf.clear(); - locationCache.insert( allKeys, Reference() ); - break; - case FDBDatabaseOptions::TRANSACTION_TIMEOUT: - if( !apiVersionAtLeast(610) ) { - throw invalid_option(); - } - transactionTimeout = extractIntOption(value, 0, std::numeric_limits::max())/1000.0; - break; - case FDBDatabaseOptions::TRANSACTION_RETRY_LIMIT: - transactionMaxRetries = (int)extractIntOption(value, -1, std::numeric_limits::max()); - break; - case FDBDatabaseOptions::TRANSACTION_MAX_RETRY_DELAY: - validateOptionValue(value, true); - transactionMaxBackoff = extractIntOption(value, 0, std::numeric_limits::max()) / 1000.0; - break; - case FDBDatabaseOptions::TRANSACTION_SIZE_LIMIT: - validateOptionValue(value, true); - transactionMaxSize = extractIntOption(value, 32, CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT); - break; - case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE: - validateOptionValue(value, false); - snapshotRywEnabled++; - break; - case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE: - validateOptionValue(value, false); - snapshotRywEnabled--; - break; + int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + if (defaultFor >= 0) { + 
ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != + FDBTransactionOptions::optionInfo.end()); + transactionDefaults.addOption((FDBTransactionOptions::Option)option, value.castTo>()); + } + else { + switch(option) { + case FDBDatabaseOptions::LOCATION_CACHE_SIZE: + locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits::max()); + break; + case FDBDatabaseOptions::MACHINE_ID: + clientLocality = LocalityData( clientLocality.processId(), value.present() ? Standalone(value.get()) : Optional>(), clientLocality.machineId(), clientLocality.dcId() ); + if( clientInfo->get().proxies.size() ) + masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality ) ); + server_interf.clear(); + locationCache.insert( allKeys, Reference() ); + break; + case FDBDatabaseOptions::MAX_WATCHES: + maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES); + break; + case FDBDatabaseOptions::DATACENTER_ID: + clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? 
Standalone(value.get()) : Optional>()); + if( clientInfo->get().proxies.size() ) + masterProxies = Reference( new ProxyInfo( clientInfo->get().proxies, clientLocality )); + server_interf.clear(); + locationCache.insert( allKeys, Reference() ); + break; + case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE: + validateOptionValue(value, false); + snapshotRywEnabled++; + break; + case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE: + validateOptionValue(value, false); + snapshotRywEnabled--; + break; + } } } @@ -839,6 +824,11 @@ Database Database::createDatabase( std::string connFileName, int apiVersion, Loc return Database::createDatabase(rccf, apiVersion, clientLocality); } +const UniqueOrderedOptionList& Database::getTransactionDefaults() const { + ASSERT(db); + return db->transactionDefaults; +} + extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs); Cluster::Cluster( Reference connFile, Reference> connectedCoordinatorsNum, int apiVersion ) @@ -2457,8 +2447,6 @@ double Transaction::getBackoff(int errCode) { } TransactionOptions::TransactionOptions(Database const& cx) { - maxBackoff = cx->transactionMaxBackoff; - sizeLimit = cx->transactionMaxSize; reset(cx); if (BUGGIFY) { commitOnFirstProxy = true; @@ -2472,11 +2460,9 @@ TransactionOptions::TransactionOptions() { } void TransactionOptions::reset(Database const& cx) { - double oldMaxBackoff = maxBackoff; - uint32_t oldSizeLimit = sizeLimit; memset(this, 0, sizeof(*this)); - maxBackoff = cx->apiVersionAtLeast(610) ? 
oldMaxBackoff : cx->transactionMaxBackoff; - sizeLimit = oldSizeLimit; + maxBackoff = CLIENT_KNOBS->DEFAULT_MAX_BACKOFF; + sizeLimit = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; lockAware = cx->lockAware; } @@ -2503,7 +2489,6 @@ void Transaction::reset() { void Transaction::fullReset() { reset(); backoff = CLIENT_KNOBS->DEFAULT_BACKOFF; - options.maxBackoff = getDatabase()->transactionMaxBackoff; } int Transaction::apiVersionAtLeast(int minVersion) const { @@ -3150,8 +3135,7 @@ Future> Transaction::getVersionstamp() { } Future Transaction::onError( Error const& e ) { - if (e.code() == error_code_success) - { + if (e.code() == error_code_success) { return client_invalid_operation(); } if (e.code() == error_code_not_committed || @@ -3175,7 +3159,7 @@ Future Transaction::onError( Error const& e ) { double backoff = getBackoff(e.code()); reset(); - return delay( backoff, info.taskID ); + return delay(backoff, info.taskID); } if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version) @@ -3187,7 +3171,7 @@ Future Transaction::onError( Error const& e ) { double maxBackoff = options.maxBackoff; reset(); - return delay( std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), info.taskID ); + return delay(std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), info.taskID); } if(g_network->isSimulated() && ++numErrors % 10 == 0) diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 0fbf76cfe4..4419646da3 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -90,6 +90,8 @@ public: inline DatabaseContext* extractPtr() { return db.extractPtr(); } DatabaseContext* operator->() const { return db.getPtr(); } + const UniqueOrderedOptionList& getTransactionDefaults() const; + private: Reference db; }; diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 7d006dd85a..8bc1f3683c 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ 
b/fdbclient/ReadYourWrites.actor.cpp @@ -1124,7 +1124,8 @@ public: }; ReadYourWritesTransaction::ReadYourWritesTransaction( Database const& cx ) : cache(&arena), writes(&arena), tr(cx), retries(0), creationTime(now()), commitStarted(false), options(tr), deferredError(cx->deferredError) { - resetTimeout(); + std::copy(cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(), std::back_inserter(persistentOptions)); + applyPersistentOptions(); } ACTOR Future timebomb(double endTime, Promise resetPromise) { @@ -1473,36 +1474,16 @@ void ReadYourWritesTransaction::writeRangeToNativeTransaction( KeyRangeRef const } ReadYourWritesTransactionOptions::ReadYourWritesTransactionOptions(Transaction const& tr) { - Database cx = tr.getDatabase(); - timeoutInSeconds = cx->transactionTimeout; - maxRetries = cx->transactionMaxRetries; reset(tr); } void ReadYourWritesTransactionOptions::reset(Transaction const& tr) { - double oldTimeout = timeoutInSeconds; - int oldMaxRetries = maxRetries; memset(this, 0, sizeof(*this)); - if( tr.apiVersionAtLeast(610) ) { - // Starting in API version 610, these options are not cleared after reset. 
- timeoutInSeconds = oldTimeout; - maxRetries = oldMaxRetries; - } - else { - Database cx = tr.getDatabase(); - maxRetries = cx->transactionMaxRetries; - timeoutInSeconds = cx->transactionTimeout; - } + timeoutInSeconds = 0.0; + maxRetries = -1; snapshotRywEnabled = tr.getDatabase()->snapshotRywEnabled; } -void ReadYourWritesTransactionOptions::fullReset(Transaction const& tr) { - reset(tr); - Database cx = tr.getDatabase(); - maxRetries = cx->transactionMaxRetries; - timeoutInSeconds = cx->transactionTimeout; -} - bool ReadYourWritesTransactionOptions::getAndResetWriteConflictDisabled() { bool disabled = nextWriteDisableConflictRange; nextWriteDisableConflictRange = false; @@ -1777,7 +1758,15 @@ Future> ReadYourWritesTransaction::getVersionstamp() { return waitOrError(tr.getVersionstamp(), resetPromise.getFuture()); } -void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { +void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { + setOptionImpl(option, value); + + if(FDBTransactionOptions::optionInfo[option].persistent) { + persistentOptions.push_back(std::make_pair(option, value.castTo>())); + } +} + +void ReadYourWritesTransaction::setOptionImpl( FDBTransactionOptions::Option option, Optional value ) { switch(option) { case FDBTransactionOptions::READ_YOUR_WRITES_DISABLE: validateOptionValue(value, false); @@ -1815,8 +1804,8 @@ void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, case FDBTransactionOptions::TIMEOUT: options.timeoutInSeconds = extractIntOption(value, 0, std::numeric_limits::max())/1000.0; - resetTimeout(); - break; + resetTimeout(); + break; case FDBTransactionOptions::RETRY_LIMIT: options.maxRetries = (int)extractIntOption(value, -1, std::numeric_limits::max()); @@ -1872,6 +1861,7 @@ void ReadYourWritesTransaction::operator=(ReadYourWritesTransaction&& r) BOOST_N transactionDebugInfo = r.transactionDebugInfo; cache.arena = &arena; 
writes.arena = &arena; + persistentOptions = std::move(r.persistentOptions); } ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& r) BOOST_NOEXCEPT : @@ -1894,12 +1884,29 @@ ReadYourWritesTransaction::ReadYourWritesTransaction(ReadYourWritesTransaction&& readConflicts = std::move(r.readConflicts); watchMap = std::move( r.watchMap ); r.resetPromise = Promise(); + persistentOptions = std::move(r.persistentOptions); } Future ReadYourWritesTransaction::onError(Error const& e) { return RYWImpl::onError( this, e ); } +void ReadYourWritesTransaction::applyPersistentOptions() { + Optional timeout; + for (auto option : persistentOptions) { + if(option.first == FDBTransactionOptions::TIMEOUT) { + timeout = option.second.castTo(); + } + else { + setOptionImpl(option.first, option.second.castTo()); + } + } + + if(timeout.present()) { + setOptionImpl(FDBTransactionOptions::TIMEOUT, timeout); + } +} + void ReadYourWritesTransaction::resetRyow() { Promise oldReset = resetPromise; resetPromise = Promise(); @@ -1917,7 +1924,7 @@ void ReadYourWritesTransaction::resetRyow() { if(tr.apiVersionAtLeast(16)) { options.reset(tr); - resetTimeout(); + applyPersistentOptions(); } if ( !oldReset.isSet() ) @@ -1933,9 +1940,11 @@ void ReadYourWritesTransaction::reset() { retries = 0; creationTime = now(); timeoutActor.cancel(); - options.fullReset(tr); + persistentOptions.clear(); + options.reset(tr); transactionDebugInfo.clear(); tr.fullReset(); + std::copy(tr.getDatabase().getTransactionDefaults().begin(), tr.getDatabase().getTransactionDefaults().end(), std::back_inserter(persistentOptions)); resetRyow(); } diff --git a/fdbclient/ReadYourWrites.h b/fdbclient/ReadYourWrites.h index c5d4e0fafc..f4b93eabcc 100644 --- a/fdbclient/ReadYourWrites.h +++ b/fdbclient/ReadYourWrites.h @@ -44,7 +44,6 @@ struct ReadYourWritesTransactionOptions { ReadYourWritesTransactionOptions() {} explicit ReadYourWritesTransactionOptions(Transaction const& tr); void reset(Transaction 
const& tr); - void fullReset(Transaction const& tr); bool getAndResetWriteConflictDisabled(); }; @@ -160,6 +159,10 @@ private: void debugLogRetries(Optional error = Optional()); + void setOptionImpl( FDBTransactionOptions::Option option, Optional value = Optional() ); + void applyPersistentOptions(); + + std::vector>>> persistentOptions; ReadYourWritesTransactionOptions options; }; diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 130b1652ce..d341ac3c5d 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -53,6 +53,8 @@ Reference ThreadSafeDatabase::createTransaction() { void ThreadSafeDatabase::setOption( FDBDatabaseOptions::Option option, Optional value) { DatabaseContext *db = this->db; Standalone> passValue = value; + + // ThreadSafeDatabase is not allowed to do anything with options except pass them through to RYW. onMainThreadVoid( [db, option, passValue](){ db->checkDeferredError(); db->setOption(option, passValue.contents()); @@ -274,6 +276,8 @@ ThreadFuture> ThreadSafeTransaction::getVersionstamp() { void ThreadSafeTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { ReadYourWritesTransaction *tr = this->tr; Standalone> passValue = value; + + // ThreadSafeTransaction is not allowed to do anything with options except pass them through to RYW. 
onMainThreadVoid( [tr, option, passValue](){ tr->setOption(option, passValue.contents()); }, &tr->deferredError ); } diff --git a/fdbclient/vexillographer/cpp.cs b/fdbclient/vexillographer/cpp.cs index 0d17cbdb5c..4ea844f6ca 100644 --- a/fdbclient/vexillographer/cpp.cs +++ b/fdbclient/vexillographer/cpp.cs @@ -47,8 +47,8 @@ namespace vexillographer private static string getCInfoLine(Option o, string indent, string structName) { - return String.Format("{0}ADD_OPTION_INFO({1}, {2}, \"{2}\", \"{3}\", \"{4}\", {5}, {6})", - indent, structName, o.name.ToUpper(), o.comment, o.getParameterComment(), (o.paramDesc != null).ToString().ToLower(), o.hidden.ToString().ToLower()); + return String.Format("{0}ADD_OPTION_INFO({1}, {2}, \"{2}\", \"{3}\", \"{4}\", {5}, {6}, {7}, {8})", + indent, structName, o.name.ToUpper(), o.comment, o.getParameterComment(), (o.paramDesc != null).ToString().ToLower(), o.hidden.ToString().ToLower(), o.persistent.ToString().ToLower(), o.defaultFor); } private static void writeCppInfo(TextWriter outFile, Scope scope, IEnumerable - diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index b69faad5a0..5f7f45ea19 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -282,6 +282,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( PROXY_SPIN_DELAY, 0.01 ); init( UPDATE_REMOTE_LOG_VERSION_INTERVAL, 2.0 ); init( MAX_TXS_POP_VERSION_HISTORY, 1e5 ); + init( PROXY_FORWARD_DELAY, 10.0 ); // Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. 
DataDistibution) diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 1953b547b4..f8517ababe 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -227,6 +227,7 @@ public: double PROXY_SPIN_DELAY; double UPDATE_REMOTE_LOG_VERSION_INTERVAL; int MAX_TXS_POP_VERSION_HISTORY; + double PROXY_FORWARD_DELAY; // Master Server double COMMIT_SLEEP_TIME; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 8051ddb662..83cf0d51db 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -965,7 +965,7 @@ ACTOR Future commitBatch( break; } when(GetReadVersionReply v = wait(self->getConsistentReadVersion.getReply(GetReadVersionRequest(0, GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE | GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)))) { - if(v.version > self->committedVersion.get()) { + if(!v.newClientInfo.present() && v.version > self->committedVersion.get()) { self->locked = v.locked; self->metadataVersion = v.metadataVersion; self->committedVersion.set(v.version); @@ -1782,33 +1782,69 @@ ACTOR Future masterProxyServerCore( ACTOR Future checkRemoved(Reference> db, uint64_t recoveryCount, MasterProxyInterface myInterface) { loop{ - if (db->get().recoveryCount >= recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), myInterface)) - throw worker_removed(); + if (db->get().recoveryCount >= recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), myInterface)) { + throw worker_removed(); + } wait(db->onChange()); } } +ACTOR Future forwardProxy(ClientDBInfo info, RequestStream commit, RequestStream getConsistentReadVersion, RequestStream getKeyServersLocations) { + loop { + choose { + when(CommitTransactionRequest req = waitNext(commit.getFuture())) { + CommitID rep; + rep.newClientInfo = info; + req.reply.send(rep); + } + when(GetReadVersionRequest req = waitNext(getConsistentReadVersion.getFuture())) { + 
GetReadVersionReply rep; + rep.newClientInfo = info; + req.reply.send(rep); + } + when(GetKeyServerLocationsRequest req = waitNext(getKeyServersLocations.getFuture())) { + GetKeyServerLocationsReply rep; + rep.newClientInfo = info; + req.reply.send(rep); + } + } + wait(yield()); + } +} + ACTOR Future masterProxyServer( MasterProxyInterface proxy, InitializeMasterProxyRequest req, Reference> db, std::string whitelistBinPaths) { + state Future core; try { - state Future core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy, whitelistBinPaths); - loop choose{ - when(wait(core)) { return Void(); } - when(wait(checkRemoved(db, req.recoveryCount, proxy))) {} - } + core = masterProxyServerCore(proxy, req.master, db, req.recoveryCount, req.recoveryTransactionVersion, req.firstProxy, whitelistBinPaths); + wait(core || checkRemoved(db, req.recoveryCount, proxy)); } catch (Error& e) { - if (e.code() == error_code_actor_cancelled || e.code() == error_code_worker_removed || e.code() == error_code_tlog_stopped || - e.code() == error_code_master_tlog_failed || e.code() == error_code_coordinators_changed || e.code() == error_code_coordinated_state_conflict || - e.code() == error_code_new_coordinators_timed_out) - { - TraceEvent("MasterProxyTerminated", proxy.id()).error(e, true); + TraceEvent("MasterProxyTerminated", proxy.id()).error(e, true); + + if (e.code() != error_code_worker_removed && e.code() != error_code_tlog_stopped && + e.code() != error_code_master_tlog_failed && e.code() != error_code_coordinators_changed && + e.code() != error_code_coordinated_state_conflict && e.code() != error_code_new_coordinators_timed_out) { + throw; + } + } + core.cancel(); + state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY); + loop { + if(finishForward.isReady()) { return Void(); } - throw; + if(db->get().client.proxies.size() > 0 && !db->get().client.proxies[0].provisional && db->get().recoveryCount >= 
req.recoveryCount + && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), proxy)) { + core = forwardProxy(db->get().client, proxy.commit, proxy.getConsistentReadVersion, proxy.getKeyServersLocations); + proxy = MasterProxyInterface(); + wait(finishForward); + return Void(); + } + wait(db->onChange() || finishForward); } } diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index d0ac5392a5..d7502b5f35 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -1357,7 +1357,7 @@ ACTOR Future masterCore( Reference self ) { // SOMEDAY: For faster recovery, do this and setDBState asynchronously and don't wait for them // unless we want to change TLogs wait((success(recoveryCommit) && sendInitialCommitToResolvers(self)) ); - if(recoveryCommit.isReady() && recoveryCommit.get().isError()) { + if(recoveryCommit.isReady() && ( recoveryCommit.get().isError() || recoveryCommit.get().get().newClientInfo.present() )) { TEST(true); // Master recovery failed because of the initial commit failed throw master_recovery_failed(); } diff --git a/fdbserver/workloads/ConsistencyCheck.actor.cpp b/fdbserver/workloads/ConsistencyCheck.actor.cpp index 0f4f064bf4..f7a67f09dc 100644 --- a/fdbserver/workloads/ConsistencyCheck.actor.cpp +++ b/fdbserver/workloads/ConsistencyCheck.actor.cpp @@ -354,7 +354,7 @@ struct ConsistencyCheckWorkload : TestWorkload ErrorOr shards = keyServerLocationFutures[i].get(); //If performing quiescent check, then all master proxies should be reachable. 
Otherwise, only one needs to be reachable - if (self->performQuiescentChecks && !shards.present()) + if (self->performQuiescentChecks && (!shards.present() || shards.get().newClientInfo.present())) { TraceEvent("ConsistencyCheck_MasterProxyUnavailable").detail("MasterProxyID", proxyInfo->getId(i)); self->testFailure("Master proxy unavailable"); @@ -363,7 +363,7 @@ struct ConsistencyCheckWorkload : TestWorkload //Get the list of shards if one was returned. If not doing a quiescent check, we can break if it is. //If we are doing a quiescent check, then we only need to do this for the first shard. - if (shards.present() && !keyServersInsertedForThisIteration) + if (shards.present() && !shards.get().newClientInfo.present() && !keyServersInsertedForThisIteration) { keyServers.insert(keyServers.end(), shards.get().results.begin(), shards.get().results.end()); keyServersInsertedForThisIteration = true; From a5a6f8431cb2404b69d58a15b0cbe6e9a7146dce Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Mon, 8 Jul 2019 14:01:04 -0700 Subject: [PATCH 082/136] Add a random UID to TransactionMetrics in case a client opens multiple connections and also a field to indicate whether the connection is internal. Convert some of the metrics to our Counter object instead of running totals. 
--- fdbbackup/backup.actor.cpp | 4 +- fdbcli/fdbcli.actor.cpp | 2 +- fdbclient/DatabaseContext.h | 40 +++++----- fdbclient/NativeAPI.actor.cpp | 93 ++++++++++++----------- fdbclient/NativeAPI.actor.h | 4 +- fdbclient/ThreadSafeTransaction.actor.cpp | 2 +- fdbserver/Restore.actor.cpp | 2 +- fdbserver/tester.actor.cpp | 2 +- fdbserver/worker.actor.cpp | 2 +- flow/Stats.actor.cpp | 20 +++-- flow/Stats.h | 21 +++++ 11 files changed, 114 insertions(+), 78 deletions(-) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 1e4fd786e2..5d62be9a8b 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -3244,7 +3244,7 @@ int main(int argc, char* argv[]) { } try { - db = Database::createDatabase(ccf, -1, localities); + db = Database::createDatabase(ccf, -1, true, localities); } catch (Error& e) { fprintf(stderr, "ERROR: %s\n", e.what()); @@ -3266,7 +3266,7 @@ int main(int argc, char* argv[]) { } try { - sourceDb = Database::createDatabase(sourceCcf, -1, localities); + sourceDb = Database::createDatabase(sourceCcf, -1, true, localities); } catch (Error& e) { fprintf(stderr, "ERROR: %s\n", e.what()); diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index ebb63a82b9..a84712ddfd 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -2516,7 +2516,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { TraceEvent::setNetworkThread(); try { - db = Database::createDatabase(ccf, -1); + db = Database::createDatabase(ccf, -1, false); if (!opt.exec.present()) { printf("Using cluster file `%s'.\n", ccf->getFilename().c_str()); } diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 4c1c21dc6a..35eb8e6f71 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -58,7 +58,7 @@ public: ~DatabaseContext(); - Database clone() const { return Database(new DatabaseContext( cluster, clientInfo, clientInfoMonitor, dbId, taskID, clientLocality, enableLocalityLoadBalance, 
lockAware, apiVersion )); } + Database clone() const { return Database(new DatabaseContext( cluster, clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, internal, apiVersion )); } std::pair> getCachedLocation( const KeyRef&, bool isBackward = false ); bool getCachedLocations( const KeyRangeRef&, vector>>&, int limit, bool reverse ); @@ -97,8 +97,8 @@ public: //private: explicit DatabaseContext( Reference cluster, Reference> clientDBInfo, - Future clientInfoMonitor, Standalone dbId, TaskPriority taskID, LocalityData const& clientLocality, - bool enableLocalityLoadBalance, bool lockAware, int apiVersion = Database::API_VERSION_LATEST ); + Future clientInfoMonitor, TaskPriority taskID, LocalityData const& clientLocality, + bool enableLocalityLoadBalance, bool lockAware, bool internal = true, int apiVersion = Database::API_VERSION_LATEST ); explicit DatabaseContext( const Error &err ); @@ -133,22 +133,26 @@ public: std::map< UID, StorageServerInfo* > server_interf; - Standalone dbId; + UID dbId; + bool internal; + + CounterCollection cc; + + Counter transactionReadVersions; + Counter transactionLogicalReads; + Counter transactionPhysicalReads; + Counter transactionCommittedMutations; + Counter transactionCommittedMutationBytes; + Counter transactionsCommitStarted; + Counter transactionsCommitCompleted; + Counter transactionsTooOld; + Counter transactionsFutureVersions; + Counter transactionsNotCommitted; + Counter transactionsMaybeCommitted; + Counter transactionsResourceConstrained; + Counter transactionsProcessBehind; + Counter transactionWaitsForFullRecovery; - int64_t transactionReadVersions; - int64_t transactionLogicalReads; - int64_t transactionPhysicalReads; - int64_t transactionCommittedMutations; - int64_t transactionCommittedMutationBytes; - int64_t transactionsCommitStarted; - int64_t transactionsCommitCompleted; - int64_t transactionsTooOld; - int64_t transactionsFutureVersions; - int64_t transactionsNotCommitted; - 
int64_t transactionsMaybeCommitted; - int64_t transactionsResourceConstrained; - int64_t transactionsProcessBehind; - int64_t transactionWaitsForFullRecovery; ContinuousSample latencies, readLatencies, commitLatencies, GRVLatencies, mutationsPerCommit, bytesPerCommit; int outstandingWatches; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8ada99503f..b9cea4c1ad 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -208,24 +208,18 @@ template <> void addref( DatabaseContext* ptr ) { ptr->addref(); } template <> void delref( DatabaseContext* ptr ) { ptr->delref(); } ACTOR Future databaseLogger( DatabaseContext *cx ) { + state double lastLogged = 0; loop { - wait( delay( CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, cx->taskID ) ); - TraceEvent("TransactionMetrics") + wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, cx->taskID)); + TraceEvent ev("TransactionMetrics", cx->dbId); + + ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) .detail("Cluster", cx->cluster && cx->getConnectionFile() ? 
cx->getConnectionFile()->getConnectionString().clusterKeyName().toString() : "") - .detail("ReadVersions", cx->transactionReadVersions) - .detail("LogicalUncachedReads", cx->transactionLogicalReads) - .detail("PhysicalReadRequests", cx->transactionPhysicalReads) - .detail("CommittedMutations", cx->transactionCommittedMutations) - .detail("CommittedMutationBytes", cx->transactionCommittedMutationBytes) - .detail("CommitStarted", cx->transactionsCommitStarted) - .detail("CommitCompleted", cx->transactionsCommitCompleted) - .detail("TooOld", cx->transactionsTooOld) - .detail("FutureVersions", cx->transactionsFutureVersions) - .detail("NotCommitted", cx->transactionsNotCommitted) - .detail("MaybeCommitted", cx->transactionsMaybeCommitted) - .detail("ResourceConstrained", cx->transactionsResourceConstrained) - .detail("ProcessBehind", cx->transactionsProcessBehind) - .detail("MeanLatency", cx->latencies.mean()) + .detail("Internal", cx->internal); + + cx->cc.logToTraceEvent(ev); + + ev.detail("MeanLatency", cx->latencies.mean()) .detail("MedianLatency", cx->latencies.median()) .detail("Latency90", cx->latencies.percentile(0.90)) .detail("Latency98", cx->latencies.percentile(0.98)) @@ -245,12 +239,15 @@ ACTOR Future databaseLogger( DatabaseContext *cx ) { .detail("MeanBytesPerCommit", cx->bytesPerCommit.mean()) .detail("MedianBytesPerCommit", cx->bytesPerCommit.median()) .detail("MaxBytesPerCommit", cx->bytesPerCommit.max()); + cx->latencies.clear(); cx->readLatencies.clear(); cx->GRVLatencies.clear(); cx->commitLatencies.clear(); cx->mutationsPerCommit.clear(); cx->bytesPerCommit.clear(); + + lastLogged = now(); } } @@ -508,18 +505,21 @@ ACTOR static Future getHealthMetricsActor(DatabaseContext *cx, bo Future DatabaseContext::getHealthMetrics(bool detailed = false) { return getHealthMetricsActor(this, detailed); } - DatabaseContext::DatabaseContext( - Reference cluster, Reference> clientInfo, Future clientInfoMonitor, Standalone dbId, - TaskPriority taskID, LocalityData 
const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) - : cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), dbId(dbId), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), - lockAware(lockAware), apiVersion(apiVersion), provisional(false), - transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), - transactionsCommitStarted(0), transactionsCommitCompleted(0), transactionsTooOld(0), transactionsFutureVersions(0), transactionsNotCommitted(0), - transactionsMaybeCommitted(0), transactionsResourceConstrained(0), transactionsProcessBehind(0), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1), + Reference cluster, Reference> clientInfo, Future clientInfoMonitor, + TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, bool internal, int apiVersion ) + : cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), + lockAware(lockAware), apiVersion(apiVersion), provisional(false), cc("TransactionMetrics"), + transactionReadVersions("ReadVersions", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), + transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), + transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), + transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), + transactionsProcessBehind("ProcessBehind", cc), 
transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), - healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0) + healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal) { + dbId = deterministicRandom()->randomUniqueID(); + metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; @@ -539,7 +539,14 @@ DatabaseContext::DatabaseContext( clientStatusUpdater.actor = clientStatusUpdateActor(this); } -DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000) {} +DatabaseContext::DatabaseContext( const Error &err ) : deferredError(err), cc("TransactionMetrics"), + transactionReadVersions("ReadVersions", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), + transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), + transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), + transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), + transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), latencies(1000), readLatencies(1000), commitLatencies(1000), + GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), + internal(false) {} ACTOR static Future monitorClientInfo( Reference>> clusterInterface, Reference ccf, Reference> 
outInfo, Reference> connectedCoordinatorsNumDelayed ) { try { @@ -632,11 +639,11 @@ Database DatabaseContext::create(Reference>> Reference> clientInfo(new AsyncVar()); Future clientInfoMonitor = delayedAsyncVar(connectedCoordinatorsNum, connectedCoordinatorsNumDelayed, CLIENT_KNOBS->CHECK_CONNECTED_COORDINATOR_NUM_DELAY) || monitorClientInfo(clusterInterface, connFile, clientInfo, connectedCoordinatorsNumDelayed); - return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false)); + return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, true)); } Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID, bool lockAware, int apiVersion) { - return Database( new DatabaseContext( Reference(nullptr), clientInfo, clientInfoMonitor, LiteralStringRef(""), taskID, clientLocality, enableLocalityLoadBalance, lockAware, apiVersion ) ); + return Database( new DatabaseContext( Reference(nullptr), clientInfo, clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, true, apiVersion ) ); } DatabaseContext::~DatabaseContext() { @@ -816,7 +823,7 @@ Reference DatabaseContext::getConnectionFile() { return cluster->getConnectionFile(); } -Database Database::createDatabase( Reference connFile, int apiVersion, LocalityData const& clientLocality, DatabaseContext *preallocatedDb ) { +Database Database::createDatabase( Reference connFile, int apiVersion, bool internal, LocalityData const& clientLocality, DatabaseContext *preallocatedDb ) { Reference> connectedCoordinatorsNum(new AsyncVar(0)); // Number of connected coordinators for the client Reference> connectedCoordinatorsNumDelayed(new AsyncVar(0)); Reference cluster(new Cluster(connFile, connectedCoordinatorsNum, apiVersion)); @@ -825,18 
+832,18 @@ Database Database::createDatabase( Reference connFile, in DatabaseContext *db; if(preallocatedDb) { - db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); + db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, internal, apiVersion); } else { - db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); + db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, true, false, internal, apiVersion); } return Database(db); } -Database Database::createDatabase( std::string connFileName, int apiVersion, LocalityData const& clientLocality ) { +Database Database::createDatabase( std::string connFileName, int apiVersion, bool internal, LocalityData const& clientLocality ) { Reference rccf = Reference(new ClusterConnectionFile(ClusterConnectionFile::lookupClusterFileName(connFileName).first)); - return Database::createDatabase(rccf, apiVersion, clientLocality); + return Database::createDatabase(rccf, apiVersion, internal, clientLocality); } extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs); @@ -2718,7 +2725,7 @@ ACTOR static Future tryCommit( Database cx, Reference tr->versionstampPromise.send(ret); tr->numErrors = 0; - cx->transactionsCommitCompleted++; + ++cx->transactionsCommitCompleted; cx->transactionCommittedMutations += req.transaction.mutations.size(); cx->transactionCommittedMutationBytes += req.transaction.mutations.expectedSize(); @@ -2793,7 +2800,7 @@ Future Transaction::commitMutations() { return Void(); } - cx->transactionsCommitStarted++; + ++cx->transactionsCommitStarted; if(options.readOnly) return transaction_read_only(); @@ -3126,7 +3133,7 @@ ACTOR 
Future extractReadVersion(DatabaseContext* cx, Reference Transaction::getReadVersion(uint32_t flags) { - cx->transactionReadVersions++; + ++cx->transactionReadVersions; flags |= options.getReadVersionFlags; auto& batcher = cx->versionBatcher[ flags ]; @@ -3162,15 +3169,15 @@ Future Transaction::onError( Error const& e ) { e.code() == error_code_cluster_not_fully_recovered) { if(e.code() == error_code_not_committed) - cx->transactionsNotCommitted++; + ++cx->transactionsNotCommitted; if(e.code() == error_code_commit_unknown_result) - cx->transactionsMaybeCommitted++; + ++cx->transactionsMaybeCommitted; if (e.code() == error_code_proxy_memory_limit_exceeded) - cx->transactionsResourceConstrained++; + ++cx->transactionsResourceConstrained; if (e.code() == error_code_process_behind) - cx->transactionsProcessBehind++; + ++cx->transactionsProcessBehind; if (e.code() == error_code_cluster_not_fully_recovered) { - cx->transactionWaitsForFullRecovery++; + ++cx->transactionWaitsForFullRecovery; } double backoff = getBackoff(e.code()); @@ -3181,9 +3188,9 @@ Future Transaction::onError( Error const& e ) { e.code() == error_code_future_version) { if( e.code() == error_code_transaction_too_old ) - cx->transactionsTooOld++; + ++cx->transactionsTooOld; else if( e.code() == error_code_future_version ) - cx->transactionsFutureVersions++; + ++cx->transactionsFutureVersions; double maxBackoff = options.maxBackoff; reset(); diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index b7c3aa6d71..92bde7817b 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -74,8 +74,8 @@ class Database { public: enum { API_VERSION_LATEST = -1 }; - static Database createDatabase( Reference connFile, int apiVersion, LocalityData const& clientLocality=LocalityData(), DatabaseContext *preallocatedDb=nullptr ); - static Database createDatabase( std::string connFileName, int apiVersion, LocalityData const& clientLocality=LocalityData() ); + static Database 
createDatabase( Reference connFile, int apiVersion, bool internal=true, LocalityData const& clientLocality=LocalityData(), DatabaseContext *preallocatedDb=nullptr ); + static Database createDatabase( std::string connFileName, int apiVersion, bool internal=true, LocalityData const& clientLocality=LocalityData() ); Database() {} // an uninitialized database can be destructed or reassigned safely; that's it void operator= ( Database const& rhs ) { db = rhs.db; } diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 130b1652ce..d074515dba 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -68,7 +68,7 @@ ThreadSafeDatabase::ThreadSafeDatabase(std::string connFilename, int apiVersion) onMainThreadVoid([db, connFile, apiVersion](){ try { - Database::createDatabase(connFile, apiVersion, LocalityData(), db).extractPtr(); + Database::createDatabase(connFile, apiVersion, false, LocalityData(), db).extractPtr(); } catch(Error &e) { new (db) DatabaseContext(e); diff --git a/fdbserver/Restore.actor.cpp b/fdbserver/Restore.actor.cpp index a221a8593d..81f82ae387 100644 --- a/fdbserver/Restore.actor.cpp +++ b/fdbserver/Restore.actor.cpp @@ -24,7 +24,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
ACTOR Future restoreWorker(Reference ccf, LocalityData locality) { - state Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST,locality); + state Database cx = Database::createDatabase(ccf->getFilename(), Database::API_VERSION_LATEST, true, locality); state RestoreInterface interf; interf.initEndpoints(); state Optional leaderInterf; diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 3e977c9493..2904d96015 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -505,7 +505,7 @@ ACTOR Future testerServerWorkload( WorkloadRequest work, Reference workerServer( if(metricsPrefix.size() > 0) { if( metricsConnFile.size() > 0) { try { - state Database db = Database::createDatabase(metricsConnFile, Database::API_VERSION_LATEST, locality); + state Database db = Database::createDatabase(metricsConnFile, Database::API_VERSION_LATEST, true, locality); metricsLogger = runMetrics( db, KeyRef(metricsPrefix) ); } catch(Error &e) { TraceEvent(SevWarnAlways, "TDMetricsBadClusterFile").error(e).detail("ConnFile", metricsConnFile); diff --git a/flow/Stats.actor.cpp b/flow/Stats.actor.cpp index 8d0afa7455..751130bc25 100644 --- a/flow/Stats.actor.cpp +++ b/flow/Stats.actor.cpp @@ -69,6 +69,13 @@ void Counter::clear() { metric = 0; } +void CounterCollection::logToTraceEvent(TraceEvent &te) const { + for (ICounter* c : counters) { + te.detail(c->getName().c_str(), c); + c->resetInterval(); + } +} + ACTOR Future traceCounters(std::string traceEventName, UID traceEventID, double interval, CounterCollection* counters, std::string trackLatestName) { wait(delay(0)); // Give an opportunity for all members used in special counters to be initialized @@ -80,15 +87,12 @@ ACTOR Future traceCounters(std::string traceEventName, UID traceEventID, d loop{ TraceEvent te(traceEventName.c_str(), traceEventID); te.detail("Elapsed", now() - last_interval); - for (ICounter* c : counters->counters) { - if (c->hasRate() && 
c->hasRoughness()) - te.detailf(c->getName().c_str(), "%g %g %lld", c->getRate(), c->getRoughness(), (long long)c->getValue()); - else - te.detail(c->getName().c_str(), c->getValue()); - c->resetInterval(); - } - if (!trackLatestName.empty()) + + counters->logToTraceEvent(te); + + if (!trackLatestName.empty()) { te.trackLatest(trackLatestName.c_str()); + } last_interval = now(); wait(delay(interval)); diff --git a/flow/Stats.h b/flow/Stats.h index 8044c4d802..24481c3024 100644 --- a/flow/Stats.h +++ b/flow/Stats.h @@ -62,12 +62,26 @@ struct ICounter { virtual void remove() {} }; +template<> +struct Traceable : std::true_type { + static std::string toString(ICounter const *counter) { + if (counter->hasRate() && counter->hasRoughness()) { + return format("%g %g %lld", counter->getRate(), counter->getRoughness(), (long long)counter->getValue()); + } + else { + return format("%lld", (long long)counter->getValue()); + } + } +}; + struct CounterCollection { CounterCollection(std::string name, std::string id = std::string()) : name(name), id(id) {} std::vector counters, counters_to_remove; ~CounterCollection() { for (auto c : counters_to_remove) c->remove(); } std::string name; std::string id; + + void logToTraceEvent(TraceEvent& te) const; }; struct Counter : ICounter, NonCopyable { @@ -97,6 +111,13 @@ private: Int64MetricHandle metric; }; +template<> +struct Traceable : std::true_type { + static std::string toString(Counter const& counter) { + return Traceable::toString((ICounter const*)&counter); + } +}; + template struct SpecialCounter : ICounter, FastAllocated>, NonCopyable { SpecialCounter(CounterCollection& collection, std::string const& name, F && f) : name(name), f(f) { collection.counters.push_back(this); collection.counters_to_remove.push_back(this); } From 6c8f50ca669b0c72fc9f7149988bce9c431957aa Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 8 Jul 2019 22:13:09 -0700 Subject: [PATCH 083/136] Improve the behavior of parallelPeekMore+onlySpilled. 
When onlySpilled transitions from true (don't peek memory) to false (do peek memory) as part of a parallel peek, we'll end up wasting the rest of the replies because we'll honor their onlySpilled=true setting and thus not have any additional data to return. Instead, we thread the onlySpilled back through in the same way that the ending version of the last peek is used to override the requested starting version of the next peek. This simulates the same behavior that the client has, where the value of onlySpilled that we reply with comes back in the next request. I haven't actually seen it be a problem, but this should help make sure the onlySpilled transition when catching up doesn't ever cause any ill effects if a process starts riding the line between onlySpilled settings. --- fdbserver/OldTLogServer_6_0.actor.cpp | 15 ++++++++------- fdbserver/TLogServer.actor.cpp | 15 ++++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 227578d49f..07a3bee98d 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -265,7 +265,7 @@ struct TLogData : NonCopyable { int64_t overheadBytesDurable; struct PeekTrackerData { - std::map> sequence_version; + std::map>> sequence_version; double lastUpdate; }; @@ -1030,8 +1030,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = ver; + std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + req.begin = prevPeekData.first; + req.onlySpilled = prevPeekData.second; wait(yield()); } } catch( Error &e ) { @@ -1089,13 +1090,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - 
if(sequenceData.getFuture().get() != rep.end) { + if(sequenceData.getFuture().get().first != rep.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(rep.end); + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } rep.begin = req.begin; } @@ -1163,13 +1164,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - if(sequenceData.getFuture().get() != reply.end) { + if(sequenceData.getFuture().get().first != reply.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(reply.end); + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } reply.begin = req.begin; } diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 96a63c1d39..dc4728882f 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -315,7 +315,7 @@ struct TLogData : NonCopyable { int64_t overheadBytesDurable; struct PeekTrackerData { - std::map> sequence_version; + std::map>> sequence_version; double lastUpdate; }; @@ -1317,8 +1317,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } trackerData.lastUpdate = now(); - Version ver = wait(trackerData.sequence_version[sequence].getFuture()); - req.begin = ver; + std::pair prevPeekData = wait(trackerData.sequence_version[sequence].getFuture()); + req.begin = prevPeekData.first; + req.onlySpilled = prevPeekData.second; wait(yield()); } } catch( Error &e ) { @@ -1376,13 +1377,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - if(sequenceData.getFuture().get() != rep.end) { + if(sequenceData.getFuture().get().first != 
rep.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(rep.end); + sequenceData.send(std::make_pair(rep.end, rep.onlySpilled)); } rep.begin = req.begin; } @@ -1537,13 +1538,13 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere } auto& sequenceData = trackerData.sequence_version[sequence+1]; if(sequenceData.isSet()) { - if(sequenceData.getFuture().get() != reply.end) { + if(sequenceData.getFuture().get().first != reply.end) { TEST(true); //tlog peek second attempt ended at a different version req.reply.sendError(timed_out()); return Void(); } } else { - sequenceData.send(reply.end); + sequenceData.send(std::make_pair(reply.end, reply.onlySpilled)); } reply.begin = req.begin; } From d2ef84a8f964372f971f8ab9ecdf7bc8584df1cb Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 8 Jul 2019 22:22:45 -0700 Subject: [PATCH 084/136] Add a TLogVersion::V4 And refactor some code to make adding more TLogVersions easier. 
--- fdbclient/FDBTypes.h | 4 ++- fdbserver/SimulatedCluster.actor.cpp | 24 ++++++------------ fdbserver/worker.actor.cpp | 25 ++++++++++++++++--- .../workloads/ConfigureDatabase.actor.cpp | 6 ++++- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 31c246ffcb..e78a5f1813 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -601,8 +601,9 @@ struct TLogVersion { // V1 = 1, // 4.6 is dispatched to via 6.0 V2 = 2, // 6.0 V3 = 3, // 6.1 + V4 = 4, // 6.2 MIN_SUPPORTED = V2, - MAX_SUPPORTED = V3, + MAX_SUPPORTED = V4, MIN_RECRUITABLE = V2, DEFAULT = V3, } version; @@ -624,6 +625,7 @@ struct TLogVersion { static ErrorOr FromStringRef( StringRef s ) { if (s == LiteralStringRef("2")) return V2; if (s == LiteralStringRef("3")) return V3; + if (s == LiteralStringRef("4")) return V4; return default_error_or(); } }; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 81330eac10..0257563af1 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -850,23 +850,15 @@ void SimulationConfig::generateNormalConfig(int minimumReplication, int minimumR } if (deterministicRandom()->random01() < 0.5) { - if (deterministicRandom()->random01() < 0.5) { - set_config("log_spill:=1"); // VALUE - } - int logVersion = deterministicRandom()->randomInt( 0, 3 ); - switch (logVersion) { - case 0: - break; - case 1: - set_config("log_version:=2"); // 6.0 - break; - case 2: - set_config("log_version:=3"); // 6.1 - break; - } + int logSpill = deterministicRandom()->randomInt( TLogSpillType::VALUE, TLogSpillType::END ); + set_config(format("log_spill:=%d", logSpill)); + int logVersion = deterministicRandom()->randomInt( TLogVersion::MIN_RECRUITABLE, TLogVersion::MAX_SUPPORTED+1 ); + set_config(format("log_version:=%d", logVersion)); } else { - set_config("log_version:=3"); // 6.1 - set_config("log_spill:=2"); // REFERENCE + if 
(deterministicRandom()->random01() < 0.7) + set_config(format("log_version:=%d", TLogVersion::MAX_SUPPORTED)); + if (deterministicRandom()->random01() < 0.5) + set_config(format("log_spill:=%d", TLogSpillType::DEFAULT)); } if(generateFearless || (datacenters == 2 && deterministicRandom()->random01() < 0.5)) { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 5f22334b44..0025e0aebc 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -278,10 +278,27 @@ struct TLogOptions { TLogFn tLogFnForOptions( TLogOptions options ) { auto tLogFn = tLog; - if ( options.version == TLogVersion::V2 && options.spillType == TLogSpillType::VALUE) return oldTLog_6_0::tLog; - if ( options.version == TLogVersion::V2 && options.spillType == TLogSpillType::REFERENCE) ASSERT(false); - if ( options.version == TLogVersion::V3 && options.spillType == TLogSpillType::VALUE ) return oldTLog_6_0::tLog; - if ( options.version == TLogVersion::V3 && options.spillType == TLogSpillType::REFERENCE) return tLog; + if ( options.spillType == TLogSpillType::VALUE ) { + switch (options.version) { + case TLogVersion::V2: + case TLogVersion::V3: + case TLogVersion::V4: + return oldTLog_6_0::tLog; + default: + ASSERT(false); + } + } + if ( options.spillType == TLogSpillType::REFERENCE ) { + switch (options.version) { + case TLogVersion::V2: + ASSERT(false); + case TLogVersion::V3: + case TLogVersion::V4: + return tLog; + default: + ASSERT(false); + } + } ASSERT(false); return tLogFn; } diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index b49b7b8f82..6ebabc899c 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -27,7 +27,11 @@ // "ssd" is an alias to the preferred type which skews the random distribution toward it but that's okay. 
static const char* storeTypes[] = { "ssd", "ssd-1", "ssd-2", "memory", "memory-1", "memory-2" }; -static const char* logTypes[] = { "log_engine:=1", "log_engine:=2", "log_spill:=1", "log_spill:=2", "log_version:=2", "log_version:=3" }; +static const char* logTypes[] = { + "log_engine:=1", "log_engine:=2", + "log_spill:=1", "log_spill:=2", + "log_version:=2", "log_version:=3", "log_version:=4" +}; static const char* redundancies[] = { "single", "double", "triple" }; std::string generateRegions() { From 44f11702a864f1219aa288543cb126b1ea4c7bf6 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 8 Jul 2019 22:25:01 -0700 Subject: [PATCH 085/136] Log Routers will prefer to peek from satellite logs. Formerly, they would prefer to peek from the primary's logs. Testing of a failed region rejoining the cluster revealed that this becomes quite a strain on the primary logs when extremely large volumes of peek requests are coming from the Log Routers. It happens that we have satellites that contain the same mutations with Log Router tags, that have no other peeking load, so we can prefer to use the satellite to peek rather than the primary to distribute load across TLogs better. Unfortunately, this revealed a latent bug in how tagged mutations in the KnownCommittedVersion->RecoveryVersion gap were copied across generations when the number of log router tags were decreased. Satellite TLogs would be assigned log router tags using the team-building based logic in getPushLocations(), whereas TLogs would internally re-index tags according to tag.id%logRouterTags. This mismatch would mean that we could have: Log0 -2:0 ----- -2:0 Log 0 Log1 -2:1 \ >--- -2:1,-2:0 (-2:2 mod 2 becomes -2:0) Log 1 Log2 -2:2 / And now we have data that's tagged as -2:0 on a TLog that's not the preferred location for -2:0, and therefore a BestLocationOnly cursor would miss the mutations. This was never noticed before, as we never used a satellite as a preferred location to peek from. 
Merge cursors always peek from all locations, and thus a peek for -2:0 that needed data from the satellites would have gone to both TLogs and merged the results. We now take this mod-based re-indexing into account when assigning which TLogs need to recover which tags from the previous generation, to make sure that tag.id%logRouterTags always results in the assigned TLog being the preferred location. Unfortunately, previously existing clusters will potentially have satellites with log router tags indexed incorrectly, so this transition needs to be gated on a `log_version` transition. Old LogSets will have an old LogVersion, and we won't prefer the satellite for peeking. Log Sets post-6.2 (opt-in) or post-6.3 (default) will be indexed correctly, and therefore we can safely offload peeking onto the satellites. --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/TLogServer.actor.cpp | 4 ++ fdbserver/TagPartitionedLogSystem.actor.cpp | 79 ++++++++++++++++----- 4 files changed, 69 insertions(+), 16 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 06798a485a..a20f1c7fa9 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -68,6 +68,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; init( VERSIONS_PER_BATCH, VERSIONS_PER_SECOND/20 ); if( randomize && BUGGIFY ) VERSIONS_PER_BATCH = std::max(1,VERSIONS_PER_SECOND/1000); init( CONCURRENT_LOG_ROUTER_READS, 1 ); + init( LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED, 1 ); if( randomize && BUGGIFY ) LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED = 0; init( DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME, 1.0 ); init( DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME, 5.0 ); init( TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES, 2e9 ); if ( randomize && BUGGIFY ) TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES = 2e6; diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index dab19d5108..1342184cab 100644 --- 
a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -72,6 +72,7 @@ public: int64_t MAX_QUEUE_COMMIT_BYTES; int64_t VERSIONS_PER_BATCH; int CONCURRENT_LOG_ROUTER_READS; + int LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED; // 0==peek from primary, non-zero==peek from satellites double DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME; double DISK_QUEUE_ADAPTER_MAX_SWITCH_TIME; int64_t TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 96a63c1d39..20e753ed9c 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -2236,6 +2236,10 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st state Version tagAt = beginVersion; state Version lastVer = 0; + if (endVersion.present()) { + TraceEvent("TLogRestoreReplicationFactor", self->dbgid).detail("LogId", logData->logId).detail("Locality", logData->locality).detail("RecoverFrom", beginVersion).detail("RecoverTo", endVersion.get()); + } + while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2e25daae3b..e5ae6550ee 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -803,27 +803,52 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> localSets; - int bestSet = 0; + int bestPrimarySet = 0; + int bestSatelliteSet = -1; for(auto& log : tLogs) { if(log->isLocal && log->logServers.size()) { TraceEvent("TLogPeekLogRouterLocalSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogServers", log->logServerString()); localSets.push_back(log); - if(log->locality != tagLocalitySatellite) { - bestSet = localSets.size() - 1; + if(log->locality == tagLocalitySatellite) { + bestSatelliteSet = localSets.size() - 1; + } else { + bestPrimarySet = localSets.size() - 1; } } } + int bestSet = bestPrimarySet; + if 
(SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && + bestSatelliteSet != -1 && + tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4 ) { + bestSet = bestSatelliteSet; + } TraceEvent("TLogPeekLogRouterSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); //FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies across the WAN return Reference( new ILogSystem::SetPeekCursor( localSets, bestSet, localSets[bestSet]->bestLocationFor( tag ), tag, begin, getPeekEnd(), true ) ); } else { - for( auto& log : tLogs ) { - if(log->logServers.size() && log->isLocal && log->locality != tagLocalitySatellite) { - TraceEvent("TLogPeekLogRouterBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogId", log->logServers[log->bestLocationFor( tag )]->get().id()); - return Reference( new ILogSystem::ServerPeekCursor( log->logServers[log->bestLocationFor( tag )], tag, begin, getPeekEnd(), false, true ) ); + int bestPrimarySet = -1; + int bestSatelliteSet = -1; + for( int i = 0; i < tLogs.size(); i++ ) { + const auto& log = tLogs[i]; + if(log->logServers.size() && log->isLocal) { + if (log->locality == tagLocalitySatellite) { + bestSatelliteSet = i; + break; + } else { + if (bestPrimarySet == -1) bestPrimarySet = i; + } } } + int bestSet = bestPrimarySet; + if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && + bestSatelliteSet != -1 && + tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4 ) { + bestSet = bestSatelliteSet; + } + const auto& log = tLogs[bestSet]; + TraceEvent("TLogPeekLogRouterBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogId", log->logServers[log->bestLocationFor( tag )]->get().id()); + return Reference( new ILogSystem::ServerPeekCursor( log->logServers[log->bestLocationFor( tag )], tag, begin, getPeekEnd(), false, true ) ); } } bool firstOld = true; @@ -836,17 +861,26 @@ struct TagPartitionedLogSystem : ILogSystem, 
ReferenceCounted> localSets; for(auto& log : old.tLogs) { if(log->isLocal && log->logServers.size()) { TraceEvent("TLogPeekLogRouterOldLocalSet", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LogServers", log->logServerString()); localSets.push_back(log); - if(log->locality != tagLocalitySatellite) { - bestSet = localSets.size()-1; + if(log->locality == tagLocalitySatellite) { + bestSatelliteSet = localSets.size() - 1; + } else { + bestPrimarySet = localSets.size() - 1; } } } + int bestSet = bestPrimarySet; + if (SERVER_KNOBS->LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED && + bestSatelliteSet != -1 && + old.tLogs[bestSatelliteSet]->tLogVersion >= TLogVersion::V4 ) { + bestSet = bestSatelliteSet; + } TraceEvent("TLogPeekLogRouterOldSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("OldEpoch", old.epochEnd).detail("RecoveredAt", recoveredAt.present() ? recoveredAt.get() : -1).detail("FirstOld", firstOld); //FIXME: do this merge on one of the logs in the other data center to avoid sending multiple copies across the WAN @@ -1949,12 +1983,25 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags; } - for(int i = -1; i < oldLogSystem->logRouterTags; i++) { - Tag tag = i == -1 ? txsTag : Tag(tagLocalityLogRouter, i); - locations.clear(); - logSystem->tLogs[1]->getPushLocations( vector(1, tag), locations, 0 ); - for(int loc : locations) - sreqs[ loc ].recoverTags.push_back( tag ); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( {txsTag}, locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( txsTag ); + + if (logSystem->logRouterTags) { + for(int i = 0; i < oldLogSystem->logRouterTags; i++) { + Tag tag = Tag(tagLocalityLogRouter, i); + // Sattelite logs will index a mutation with tagLocalityLogRouter with an id greater than + // the number of log routers as having an id mod the number of log routers. 
We thus need + // to make sure that if we're going from more log routers in the previous generation to + // less log routers in the newer one, that we map the log router tags onto satellites that + // are the preferred location for id%logRouterTags. + Tag pushLocation = Tag(tagLocalityLogRouter, i%logSystem->logRouterTags); + locations.clear(); + logSystem->tLogs[1]->getPushLocations( {pushLocation}, locations, 0 ); + for(int loc : locations) + sreqs[ loc ].recoverTags.push_back( tag ); + } } for( int i = 0; i < recr.satelliteTLogs.size(); i++ ) From 23963328cc82ef3d9724f840c286db96533694d9 Mon Sep 17 00:00:00 2001 From: mpilman Date: Tue, 9 Jul 2019 12:11:51 -0700 Subject: [PATCH 086/136] Compile relative paths into the debug info This is a suggestion to resolve #1780 This change introduces a new cmake flag `RELATIVE_DEBUG_PATHS`. If this flag is set or FDB is compiled with `-DFDB_RELEASE=ON`, the resulting binary will have debug information using relative file paths to source files. This simulates the behavior of the old build system but might break local debugging (making the debugger aware of build and source directory will be required). 
--- cmake/ConfigureCompiler.cmake | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 0d7d6a67a8..2da36b788c 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -8,6 +8,12 @@ set(FDB_RELEASE OFF CACHE BOOL "This is a building of a final release") set(USE_LD "LD" CACHE STRING "The linker to use for building: can be LD (system default, default choice), GOLD, or LLD") set(USE_LIBCXX OFF CACHE BOOL "Use libc++") set(USE_CCACHE OFF CACHE BOOL "Use ccache for compilation if available") +set(RELATIVE_DEBUG_PATHS OFF CACHE BOOL "Use relative file paths in debug info") + +set(rel_debug_paths OFF) +if(RELATIVE_DEBUG_PATHS OR FDB_RELEASE) + set(rel_debug_paths ON) +endif() if(USE_GPERFTOOLS) find_package(Gperftools REQUIRED) @@ -103,6 +109,10 @@ else() set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld -Wl,--disable-new-dtags") endif() + if(rel_debug_paths) + add_compile_options("-fdebug-prefix-map=${CMAKE_SOURCE_DIR}=." "-fdebug-prefix-map=${CMAKE_BINARY_DIR}=.") + endif() + # we always compile with debug symbols. CPack will strip them out # and create a debuginfo rpm add_compile_options(-ggdb -fno-omit-frame-pointer) From 1bac04509e7066d6390f8422af6478868ce17624 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Sat, 15 Jun 2019 15:02:43 -0700 Subject: [PATCH 087/136] Track the local ratekeeper rate as a percentage This value is reported in status for each storage server. 
--- documentation/sphinx/source/mr-status-json-schemas.rst.inc | 1 + fdbclient/Schemas.cpp | 1 + fdbserver/Status.actor.cpp | 1 + fdbserver/storageserver.actor.cpp | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 5b0099f142..cb18fa4cb1 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -36,6 +36,7 @@ "roles":[ { "query_queue_max":0, + "local_rate":0, "input_bytes":{ "hz":0.0, "counter":0, diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index 2e3db10c40..87d8b97ed1 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -56,6 +56,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "roles":[ { "query_queue_max":0, + "local_rate":0, "input_bytes":{ "hz":0.0, "counter":0, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 47c61aeb9f..92d1498589 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -429,6 +429,7 @@ struct RolesInfo { obj["keys_queried"] = StatusCounter(storageMetrics.getValue("RowsQueried")).getStatus(); obj["mutation_bytes"] = StatusCounter(storageMetrics.getValue("MutationBytes")).getStatus(); obj["mutations"] = StatusCounter(storageMetrics.getValue("Mutations")).getStatus(); + obj.setKeyRawNumber("local_rate", storageMetrics.getValue("LocalRate")); Version version = storageMetrics.getInt64("Version"); Version durableVersion = storageMetrics.getInt64("DurableVersion"); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 7d5c59f0ba..ade11ab8a0 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -507,7 +507,7 @@ public: specialCounter(cc, "DurableVersion", [self](){ return self->durableVersion.get(); }); specialCounter(cc, "DesiredOldestVersion", [self](){ 
return self->desiredOldestVersion.get(); }); specialCounter(cc, "VersionLag", [self](){ return self->versionLag; }); - specialCounter(cc, "LocalRatekeeper", [self]{ return self->currentRate(); }); + specialCounter(cc, "LocalRate", [self]{ return self->currentRate() * 100; }); specialCounter(cc, "FetchKeysFetchActive", [self](){ return self->fetchKeysParallelismLock.activePermits(); }); specialCounter(cc, "FetchKeysWaiting", [self](){ return self->fetchKeysParallelismLock.waiters(); }); From 764a4591ada255cecc0e63fd0f1ebbbcba07a1c2 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 9 Jul 2019 14:17:26 -0700 Subject: [PATCH 088/136] Add a comment to internal flag --- fdbclient/DatabaseContext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 35eb8e6f71..dfd4ef0dd0 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -134,7 +134,7 @@ public: std::map< UID, StorageServerInfo* > server_interf; UID dbId; - bool internal; + bool internal; // Only contexts created through the C client and fdbcli are non-internal CounterCollection cc; From 705059dea129c9771bb4ef2ddb384a19c8218ee3 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 24 Jun 2019 17:36:55 -0700 Subject: [PATCH 089/136] Trace: Add support to print pointers --- flow/Trace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/flow/Trace.h b/flow/Trace.h index 6ba7e0e7db..a3957bddf4 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -210,6 +210,7 @@ FORMAT_TRACEABLE(unsigned long int, "%lu"); FORMAT_TRACEABLE(long long int, "%lld"); FORMAT_TRACEABLE(unsigned long long int, "%llu"); FORMAT_TRACEABLE(double, "%g"); +FORMAT_TRACEABLE(void*, "%p"); FORMAT_TRACEABLE(volatile long, "%ld"); FORMAT_TRACEABLE(volatile unsigned long, "%lu"); FORMAT_TRACEABLE(volatile long long, "%lld"); From 3f4f71ff9f119eb7dcded13dec32fa73ef1f485a Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 24 Jun 2019 17:37:57 -0700 
Subject: [PATCH 090/136] fdbrpc: Increment peerReferences correctly The constructor of FlowReceiver which handled reference counting peerReferences relied on calling a virtual method from constructor whose behaviour isn't correct. This patch, bubbles down result of that virtual method from derived constructor to base contructor. --- fdbrpc/FlowTransport.actor.cpp | 9 ++++----- fdbrpc/FlowTransport.h | 4 ++-- fdbrpc/fdbrpc.h | 22 +++++++++++++++------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index c20aa607a6..303699a750 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -1047,12 +1047,11 @@ Endpoint FlowTransport::loadedEndpoint( const UID& token ) { return Endpoint(g_currentDeliveryPeerAddress, token); } -void FlowTransport::addPeerReference( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) { +void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) { if (FlowTransport::transport().isClient()) { IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false)); } - - if (!receiver->isStream() || !endpoint.getPrimaryAddress().isValid()) return; + if (!isStream || !endpoint.getPrimaryAddress().isValid()) return; Peer* peer = self->getPeer(endpoint.getPrimaryAddress()); if(peer->peerReferences == -1) { peer->peerReferences = 1; @@ -1061,8 +1060,8 @@ void FlowTransport::addPeerReference( const Endpoint& endpoint, NetworkMessageRe } } -void FlowTransport::removePeerReference( const Endpoint& endpoint, NetworkMessageReceiver* receiver ) { - if (!receiver->isStream() || !endpoint.getPrimaryAddress().isValid()) return; +void FlowTransport::removePeerReference(const Endpoint& endpoint, bool isStream) { + if (!isStream || !endpoint.getPrimaryAddress().isValid()) return; Peer* peer = self->getPeer(endpoint.getPrimaryAddress(), false); if(peer) { peer->peerReferences--; diff --git 
a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index 5bda279de3..73425b4ec6 100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -132,10 +132,10 @@ public: std::map>* getIncompatiblePeers(); // Returns the same of all peers that have attempted to connect, but have incompatible protocol versions - void addPeerReference( const Endpoint&, NetworkMessageReceiver* ); + void addPeerReference(const Endpoint&, bool isStream); // Signal that a peer connection is being used, even if no messages are currently being sent to the peer - void removePeerReference( const Endpoint&, NetworkMessageReceiver* ); + void removePeerReference(const Endpoint&, bool isStream); // Signal that a peer connection is no longer being used void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 75e0a9a551..08c544ab7d 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -31,15 +31,19 @@ struct FlowReceiver : private NetworkMessageReceiver { // Common endpoint code for NetSAV<> and NetNotifiedQueue<> - FlowReceiver() : m_isLocalEndpoint(false) {} - FlowReceiver(Endpoint const& remoteEndpoint) : endpoint(remoteEndpoint), m_isLocalEndpoint(false) { - FlowTransport::transport().addPeerReference(endpoint, this); + FlowReceiver() : m_isLocalEndpoint(false), m_stream(false) { } + + FlowReceiver(Endpoint const& remoteEndpoint, bool stream) + : endpoint(remoteEndpoint), m_isLocalEndpoint(false), m_stream(stream) { + FlowTransport::transport().addPeerReference(endpoint, m_stream); + } + ~FlowReceiver() { if (m_isLocalEndpoint) { FlowTransport::transport().removeEndpoint(endpoint, this); } else { - FlowTransport::transport().removePeerReference(endpoint, this); + FlowTransport::transport().removePeerReference(endpoint, m_stream); } } @@ -63,9 +67,10 @@ struct FlowReceiver : private NetworkMessageReceiver { FlowTransport::transport().addWellKnownEndpoint(endpoint, this, taskID); } -protected: +private: Endpoint 
endpoint; bool m_isLocalEndpoint; + bool m_stream; }; template @@ -74,7 +79,9 @@ struct NetSAV : SAV, FlowReceiver, FastAllocated> { using FastAllocated>::operator delete; NetSAV(int futures, int promises) : SAV(futures, promises) {} - NetSAV(int futures, int promises, const Endpoint& remoteEndpoint) : SAV(futures, promises), FlowReceiver(remoteEndpoint) {} + NetSAV(int futures, int promises, const Endpoint& remoteEndpoint) + : SAV(futures, promises), FlowReceiver(remoteEndpoint, false) { + } virtual void destroy() { delete this; } virtual void receive(ArenaReader& reader) { @@ -228,7 +235,8 @@ struct NetNotifiedQueue : NotifiedQueue, FlowReceiver, FastAllocated>::operator delete; NetNotifiedQueue(int futures, int promises) : NotifiedQueue(futures, promises) {} - NetNotifiedQueue(int futures, int promises, const Endpoint& remoteEndpoint) : NotifiedQueue(futures, promises), FlowReceiver(remoteEndpoint) {} + NetNotifiedQueue(int futures, int promises, const Endpoint& remoteEndpoint) + : NotifiedQueue(futures, promises), FlowReceiver(remoteEndpoint, true) {} virtual void destroy() { delete this; } virtual void receive(ArenaReader& reader) { From 78a1b2defc033b0a1693589baafae65fba13c2c6 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Tue, 25 Jun 2019 16:25:42 -0700 Subject: [PATCH 091/136] simulator: Destroy each process individually in its context When simulation ends, all the actors are cancelled, and the destructions which rely on `globals` may not have access to right globals (instead of the default simulator process globals). This patch, calls destroy on each process individually after we context switch to that process so that the globals acceses in destructor are its own. This issue arised when trying to get `Peer::peerReferences` in NetNotifiedQueue, resulting in decrementing the reference count of peers in FlowTransport object of '0.0.0.0'. 
--- fdbserver/SimulatedCluster.actor.cpp | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 705f401d39..fe7c5a1ac2 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1373,6 +1373,28 @@ void checkExtraDB(const char *testFile, int &extraDB, int &minimumReplication, i ifs.close(); } +// To be called after we stop simulator, so that destructors of each process is +// called with right context, with access to right globals. At this point, we +// also no longer have to protect coordinator addresses. +// TODO: Investigate why this doesn't work when we call before stop(). Some +// earlier permanently failed processes seems to be the reason. +ACTOR Future destroyAllProcesses() { + state ISimulator::ProcessInfo* simProcess = g_simulator.getCurrentProcess(); + state vector processes = g_simulator.getAllProcesses(); + state std::vector::iterator it; + + g_simulator.protectedAddresses.clear(); + for (it = processes.begin(); it != processes.end(); ++it) { + if (*it == simProcess || (*it)->failed) continue; + wait (g_simulator.onProcess(*it, TaskPriority::DefaultYield)); + (*it)->shutdownSignal.send(ISimulator::KillInstantly); + g_simulator.destroyProcess(*it); + } + + wait (g_simulator.onProcess(simProcess, TaskPriority::DefaultYield)); + return Void(); +} + ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting, bool restoring, std::string whitelistBinPaths, Reference tlsOptions) { state vector> systemActors; state Optional connFile; @@ -1427,8 +1449,8 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot } TraceEvent("SimulatedSystemDestruct"); - destructed = true; - systemActors.clear(); - g_simulator.stop(); + destructed = true; + wait(destroyAllProcesses()); + systemActors.clear(); } From 7647d3e3c0f303b1b1a8fa9f716263ff0fe403bc Mon Sep 17 
00:00:00 2001 From: Vishesh Yadav Date: Fri, 28 Jun 2019 00:38:28 -0700 Subject: [PATCH 092/136] fdbrpc: Don't use RequestStream for pings in ConnectionMonitor RequestStream add another count to peerReference, which means as long as ConnectionMonitor is alive, we'll never get peerReference=0 keeping unnecessary connections potentially alive. --- fdbrpc/FlowTransport.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 303699a750..752f6426cd 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -396,7 +396,7 @@ struct Peer : NonCopyable { } ACTOR static Future connectionMonitor( Peer *peer ) { - state RequestStream< ReplyPromise > remotePing( Endpoint( {peer->destination}, WLTOKEN_PING_PACKET ) ); + state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { if(peer->peerReferences == 0 && peer->reliable.empty() && peer->unsent.empty()) { @@ -408,7 +408,7 @@ struct Peer : NonCopyable { // SOMEDAY: Stop monitoring and close the connection after a long period of inactivity with no reliable or onDisconnect requests outstanding state ReplyPromise reply; - FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePing.getEndpoint() ); + FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); state int64_t startingBytes = peer->bytesReceived; state int timeouts = 0; loop { From 867986cdeae0b712e414b89ce139b81195346ef6 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 28 Jun 2019 00:39:51 -0700 Subject: [PATCH 093/136] fdbrpc: Reduced connection monitoring from clients This patch does two changes to connection monitoring: 1. Connection monitoring at client side will check if the connection has been stayed idle for some time. If connection is unused for a while, we close the connection. 
There is some weirdness involved here as ping messages are by themselves are connection traffic. We get over this by making it two-phase process, first being checking idle reliable traffic, followed by disabling pings and then checking for idle unreliable traffic. 2. Connection monitoring of clients from server will no longer send pings to clients. Instead, it keep monitor the received bytes and close after certain period of inactivity. --- fdbrpc/FlowTransport.actor.cpp | 74 ++++++++++++++++++++++++++++------ fdbrpc/sim2.actor.cpp | 9 ++++- flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + flow/error_definitions.h | 1 + 5 files changed, 72 insertions(+), 14 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 752f6426cd..78d164cdd1 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -305,15 +305,18 @@ struct Peer : NonCopyable { int peerReferences; bool incompatibleProtocolVersionNewer; int64_t bytesReceived; + double lastSentTime; explicit Peer( TransportData* transport, NetworkAddress const& destination ) : transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0), reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), - compatible(true), incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0) + compatible(true), incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastSentTime(now()) { connect = connectionKeeper(this); } void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) { + if (rp) + lastSentTime = now(); unsent.setWriteBuffer(pb); if (rp) reliable.insert(rp); if (firstUnsent) dataToSend.trigger(); @@ -396,17 +399,47 @@ struct Peer : NonCopyable { } ACTOR static Future connectionMonitor( Peer *peer ) { - state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); + if (!peer->destination.isPublic()) { + // Don't send ping messages to clients. 
Instead monitor incoming client pings. + state double lastRefreshed = now(); + state int64_t lastBytesReceived = peer->bytesReceived; + loop { + wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + if (lastBytesReceived < peer->bytesReceived) { + lastRefreshed = now(); + lastBytesReceived = peer->bytesReceived; + } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT*2.5) { + throw connection_idle(); + } + } + } + state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { - if(peer->peerReferences == 0 && peer->reliable.empty() && peer->unsent.empty()) { + const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); + + if (peer->peerReferences == 0 && pendingPacketsEmpty) { throw connection_unreferenced(); } - wait( delayJittered( FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME ) ); + // TODO: Investigate connection idling at server-side peer too. + const bool monitorStateActive = peer->destination.isPublic() && + (peer->lastSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && + (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT); + if (!monitorStateActive) { + choose { + when(wait(peer->dataToSend.onTrigger())){ + peer->lastSentTime = now(); + } + when(wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT))) { + throw connection_idle(); + } + } + } - // SOMEDAY: Stop monitoring and close the connection after a long period of inactivity with no reliable or onDisconnect requests outstanding + wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + // TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding state ReplyPromise reply; FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); state int64_t startingBytes = peer->bytesReceived; @@ -414,12 +447,17 @@ struct Peer : NonCopyable { loop { choose { when (wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) )) { + // TODO: Since 
server will not ping clients (but will respond to incoming pings), is this + // a safe metric, or instead we should fail after multiple timeouts? if(startingBytes == peer->bytesReceived) { TraceEvent("ConnectionTimeout").suppressFor(1.0).detail("WithAddr", peer->destination); throw connection_failed(); } if(timeouts > 1) { - TraceEvent(SevWarnAlways, "ConnectionSlowPing").suppressFor(1.0).detail("WithAddr", peer->destination).detail("Timeouts", timeouts); + TraceEvent(SevWarnAlways, "ConnectionSlowPing") + .suppressFor(1.0) + .detail("WithAddr", peer->destination) + .detail("Timeouts", timeouts); } startingBytes = peer->bytesReceived; timeouts++; @@ -550,14 +588,21 @@ struct Peer : NonCopyable { self->discardUnreliablePackets(); reader = Future(); bool ok = e.code() == error_code_connection_failed || e.code() == error_code_actor_cancelled || - e.code() == error_code_connection_unreferenced || + e.code() == error_code_connection_unreferenced || e.code() == error_code_connection_idle || (g_network->isSimulated() && e.code() == error_code_checksum_failed); if(self->compatible) { - TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination); + TraceEvent(ok ? SevInfo : SevWarnAlways, "ConnectionClosed", conn ? conn->getDebugID() : UID()) + .error(e, true) + .suppressFor(1.0) + .detail("PeerAddr", self->destination); } else { - TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", conn ? conn->getDebugID() : UID()).error(e, true).suppressFor(1.0).detail("PeerAddr", self->destination); + TraceEvent(ok ? SevInfo : SevWarnAlways, "IncompatibleConnectionClosed", + conn ? 
conn->getDebugID() : UID()) + .error(e, true) + .suppressFor(1.0) + .detail("PeerAddr", self->destination); } if(self->destination.isPublic() && IFailureMonitor::failureMonitor().getState(self->destination).isAvailable()) { @@ -565,20 +610,25 @@ struct Peer : NonCopyable { if(now() - it.second > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY) { it.first = now(); } else if(now() - it.first > FLOW_KNOBS->TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT) { - TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID()).suppressFor(5.0).detail("PeerAddr", self->destination); + TraceEvent(SevWarnAlways, "TooManyConnectionsClosed", conn ? conn->getDebugID() : UID()) + .suppressFor(5.0) + .detail("PeerAddr", self->destination); self->transport->degraded->set(true); } it.second = now(); } if (conn) { - if (FlowTransport::transport().isClient()) { + if (FlowTransport::transport().isClient() && e.code() != error_code_connection_idle) { clientReconnectDelay = true; } conn->close(); conn = Reference(); } - IFailureMonitor::failureMonitor().notifyDisconnect( self->destination ); //< Clients might send more packets in response, which needs to go out on the next connection + + // Clients might send more packets in response, which needs to go out on the next connection + IFailureMonitor::failureMonitor().notifyDisconnect( self->destination ); + if (e.code() == error_code_actor_cancelled) throw; // Try to recover, even from serious errors, by retrying diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index a7ee2623e9..42e3e32930 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -381,8 +381,13 @@ private: ACTOR static Future trackLeakedConnection( Sim2Conn* self ) { wait( g_simulator.onProcess( self->process ) ); // SOMEDAY: Make this value variable? Dependent on buggification status? 
- wait( delay( 20.0 ) ); - TraceEvent(SevError, "LeakedConnection", self->dbgid).error(connection_leaked()).detail("MyAddr", self->process->address).detail("PeerAddr", self->peerEndpoint).detail("PeerId", self->peerId).detail("Opened", self->opened); + wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 4 ) ); + TraceEvent(SevError, "LeakedConnection", self->dbgid) + .error(connection_leaked()) + .detail("MyAddr", self->process->address) + .detail("PeerAddr", self->peerEndpoint) + .detail("PeerId", self->peerId) + .detail("Opened", self->opened); return Void(); } }; diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index a79dbed1ca..db6d24cb67 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -55,6 +55,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { //connectionMonitor init( CONNECTION_MONITOR_LOOP_TIME, isSimulated ? 0.75 : 1.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_LOOP_TIME = 6.0; init( CONNECTION_MONITOR_TIMEOUT, isSimulated ? 1.50 : 2.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_TIMEOUT = 6.0; + init( CONNECTION_MONITOR_IDLE_TIMEOUT, 10.0 ); //FlowTransport init( CONNECTION_REJECTED_MESSAGE_DELAY, 1.0 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index 2268e6dfad..910bdec66f 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -73,6 +73,7 @@ public: //connectionMonitor double CONNECTION_MONITOR_LOOP_TIME; double CONNECTION_MONITOR_TIMEOUT; + double CONNECTION_MONITOR_IDLE_TIMEOUT; //FlowTransport double CONNECTION_REJECTED_MESSAGE_DELAY; diff --git a/flow/error_definitions.h b/flow/error_definitions.h index 25f000935b..a505641694 100755 --- a/flow/error_definitions.h +++ b/flow/error_definitions.h @@ -69,6 +69,7 @@ ERROR( transaction_not_permitted, 1045, "Operation not permitted") ERROR( cluster_not_fully_recovered, 1046, "Cluster not fully recovered") ERROR( txn_exec_log_anti_quorum, 1047, "Execute Transaction not supported when log anti quorum is configured") ERROR( connection_unreferenced, 1048, "No peer references for connection" 
) +ERROR( connection_idle, 1049, "Connection closed after idle timeout" ) ERROR( broken_promise, 1100, "Broken promise" ) ERROR( operation_cancelled, 1101, "Asynchronous operation cancelled" ) From ae6c3e013a27b4ca559c2db55abe82a9817ad625 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 28 Jun 2019 13:18:49 -0700 Subject: [PATCH 094/136] monitorClientInfo: Wait for master proxy endpoint failures than triggers This will not initiate request to get get new set of proxy unless we know for a fact that endpoint has indeed failed, not just because the connection to Peer was closed as it was sitting idle. --- fdbclient/NativeAPI.actor.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 8ada99503f..9b2685508f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -592,13 +592,13 @@ ACTOR static Future monitorClientInfo( Reference Date: Fri, 28 Jun 2019 13:43:22 -0700 Subject: [PATCH 095/136] fdbrpc: Instead of tracking last sent data, track last sent non-ping data * This will allow client to continue monitoring peer connections while connection stays open, so that there is no period of "uncertainity" without previous no-monitoring approach. * Use multiplier for incoming connection idle timeout * Update idle connection timeout values and leaked connection timeout in simulator. 
--- fdbrpc/FlowTransport.actor.cpp | 49 +++++++++++++--------------------- fdbrpc/sim2.actor.cpp | 6 ++++- flow/Knobs.cpp | 3 ++- flow/Knobs.h | 1 + 4 files changed, 26 insertions(+), 33 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 78d164cdd1..5392cbede2 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -305,18 +305,16 @@ struct Peer : NonCopyable { int peerReferences; bool incompatibleProtocolVersionNewer; int64_t bytesReceived; - double lastSentTime; + double lastDataPacketSentTime; - explicit Peer( TransportData* transport, NetworkAddress const& destination ) - : transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0), reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), - compatible(true), incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastSentTime(now()) - { + explicit Peer(TransportData* transport, NetworkAddress const& destination) + : transport(transport), destination(destination), outgoingConnectionIdle(false), lastConnectTime(0.0), + reconnectionDelay(FLOW_KNOBS->INITIAL_RECONNECTION_TIME), compatible(true), + incompatibleProtocolVersionNewer(false), peerReferences(-1), bytesReceived(0), lastDataPacketSentTime(now()) { connect = connectionKeeper(this); } void send(PacketBuffer* pb, ReliablePacket* rp, bool firstUnsent) { - if (rp) - lastSentTime = now(); unsent.setWriteBuffer(pb); if (rp) reliable.insert(rp); if (firstUnsent) dataToSend.trigger(); @@ -408,7 +406,8 @@ struct Peer : NonCopyable { if (lastBytesReceived < peer->bytesReceived) { lastRefreshed = now(); lastBytesReceived = peer->bytesReceived; - } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT*2.5) { + } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * + FLOW_KNOBS->CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER) { throw connection_idle(); } } @@ -417,28 +416,14 @@ struct Peer 
: NonCopyable { state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); - - if (peer->peerReferences == 0 && pendingPacketsEmpty) { - throw connection_unreferenced(); + if (pendingPacketsEmpty && (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && + (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { + if (peer->peerReferences == 0) + throw connection_unreferenced(); + else if (peer->destination.isPublic()) + throw connection_idle(); } - // TODO: Investigate connection idling at server-side peer too. - const bool monitorStateActive = peer->destination.isPublic() && - (peer->lastSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && - (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT); - if (!monitorStateActive) { - choose { - when(wait(peer->dataToSend.onTrigger())){ - peer->lastSentTime = now(); - } - when(wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT))) { - throw connection_idle(); - } - } - } - - wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); - // TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding state ReplyPromise reply; FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); @@ -447,8 +432,6 @@ struct Peer : NonCopyable { loop { choose { when (wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT ) )) { - // TODO: Since server will not ping clients (but will respond to incoming pings), is this - // a safe metric, or instead we should fail after multiple timeouts? 
if(startingBytes == peer->bytesReceived) { TraceEvent("ConnectionTimeout").suppressFor(1.0).detail("WithAddr", peer->destination); throw connection_failed(); @@ -470,6 +453,8 @@ struct Peer : NonCopyable { } } } + + wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); } } @@ -1262,7 +1247,9 @@ static PacketID sendPacket( TransportData* self, ISerializeSource const& what, c #endif peer->send(pb, rp, firstUnsent); - + if (destination.token != WLTOKEN_PING_PACKET) { + peer->lastDataPacketSentTime = now(); + } return (PacketID)rp; } } diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 42e3e32930..92402fd56a 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -381,7 +381,11 @@ private: ACTOR static Future trackLeakedConnection( Sim2Conn* self ) { wait( g_simulator.onProcess( self->process ) ); // SOMEDAY: Make this value variable? Dependent on buggification status? - wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 4 ) ); + if (self->process->address.isPublic()) { + wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); + } else { + wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); + } TraceEvent(SevError, "LeakedConnection", self->dbgid) .error(connection_leaked()) .detail("MyAddr", self->process->address) diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index db6d24cb67..12d9b25cc1 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -55,7 +55,8 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { //connectionMonitor init( CONNECTION_MONITOR_LOOP_TIME, isSimulated ? 0.75 : 1.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_LOOP_TIME = 6.0; init( CONNECTION_MONITOR_TIMEOUT, isSimulated ? 
1.50 : 2.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_TIMEOUT = 6.0; - init( CONNECTION_MONITOR_IDLE_TIMEOUT, 10.0 ); + init( CONNECTION_MONITOR_IDLE_TIMEOUT, 180.0 ); + init( CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER, 1.2 ); //FlowTransport init( CONNECTION_REJECTED_MESSAGE_DELAY, 1.0 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index 910bdec66f..c2adec0bd6 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -74,6 +74,7 @@ public: double CONNECTION_MONITOR_LOOP_TIME; double CONNECTION_MONITOR_TIMEOUT; double CONNECTION_MONITOR_IDLE_TIMEOUT; + double CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER; //FlowTransport double CONNECTION_REJECTED_MESSAGE_DELAY; From 22678267cdb8fa519866c52dd91c20d82f54df30 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Fri, 5 Jul 2019 16:27:17 -0700 Subject: [PATCH 096/136] fdbrpc: Don't drop idle connections from server Instead try pinging the client and let that decide whether the client is alive or not. Ideally, it should always be failed since a well behaved client would have closed the connection. --- fdbrpc/FlowTransport.actor.cpp | 44 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 5392cbede2..605edbbfc3 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -397,33 +397,39 @@ struct Peer : NonCopyable { } ACTOR static Future connectionMonitor( Peer *peer ) { - if (!peer->destination.isPublic()) { - // Don't send ping messages to clients. Instead monitor incoming client pings. 
- state double lastRefreshed = now(); - state int64_t lastBytesReceived = peer->bytesReceived; - loop { - wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); - if (lastBytesReceived < peer->bytesReceived) { - lastRefreshed = now(); - lastBytesReceived = peer->bytesReceived; - } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * - FLOW_KNOBS->CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER) { - throw connection_idle(); - } - } - } - state Endpoint remotePingEndpoint({ peer->destination }, WLTOKEN_PING_PACKET); loop { + if (!FlowTransport::transport().isClient() && !peer->destination.isPublic()) { + // Don't send ping messages to clients unless necessary. Instead monitor incoming client pings. + state double lastRefreshed = now(); + state int64_t lastBytesReceived = peer->bytesReceived; + loop { + wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + if (lastBytesReceived < peer->bytesReceived) { + lastRefreshed = now(); + lastBytesReceived = peer->bytesReceived; + } else if (lastRefreshed < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * + FLOW_KNOBS->CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER) { + // If we have not received anything in this period, client must have closed + // connection by now. Break loop to check if it is still alive by sending a ping. + break; + } + } + } + const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); if (pendingPacketsEmpty && (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { - if (peer->peerReferences == 0) + if (peer->peerReferences == 0) { throw connection_unreferenced(); - else if (peer->destination.isPublic()) + } else if (FlowTransport::transport().isClient() && peer->destination.isPublic()) { + // First condition is necessary because we may get here if we are server. 
throw connection_idle(); + } } + wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); + // TODO: Stop monitoring and close the connection with no onDisconnect requests outstanding state ReplyPromise reply; FlowTransport::transport().sendUnreliable( SerializeSource>(reply), remotePingEndpoint ); @@ -453,8 +459,6 @@ struct Peer : NonCopyable { } } } - - wait (delayJittered(FLOW_KNOBS->CONNECTION_MONITOR_LOOP_TIME)); } } From 983343978e5e25b1d6867b11e5c346cbc0b754b1 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Mon, 8 Jul 2019 19:13:53 -0700 Subject: [PATCH 097/136] fdbrpc: ConnectionMonitor should close unreferenced after delay Potentially for cases, where it goes up to 1 immediately. --- fdbrpc/FlowTransport.actor.cpp | 12 +++++++----- flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index 605edbbfc3..d0f69162aa 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -417,12 +417,14 @@ struct Peer : NonCopyable { } } - const bool pendingPacketsEmpty = peer->reliable.empty() && peer->unsent.empty(); - if (pendingPacketsEmpty && (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && - (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { - if (peer->peerReferences == 0) { + if (peer->reliable.empty() && peer->unsent.empty()) { + if (peer->peerReferences == 0 && + (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY)) { + // TODO: What about when peerReference == -1? 
throw connection_unreferenced(); - } else if (FlowTransport::transport().isClient() && peer->destination.isPublic()) { + } else if (FlowTransport::transport().isClient() && peer->destination.isPublic() && + (peer->lastConnectTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT) && + (peer->lastDataPacketSentTime < now() - FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT)) { // First condition is necessary because we may get here if we are server. throw connection_idle(); } diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 12d9b25cc1..90f2e078bd 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -57,6 +57,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( CONNECTION_MONITOR_TIMEOUT, isSimulated ? 1.50 : 2.0 ); if( randomize && BUGGIFY ) CONNECTION_MONITOR_TIMEOUT = 6.0; init( CONNECTION_MONITOR_IDLE_TIMEOUT, 180.0 ); init( CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER, 1.2 ); + init( CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY, 2.0 ); //FlowTransport init( CONNECTION_REJECTED_MESSAGE_DELAY, 1.0 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index c2adec0bd6..99ac9df386 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -75,6 +75,7 @@ public: double CONNECTION_MONITOR_TIMEOUT; double CONNECTION_MONITOR_IDLE_TIMEOUT; double CONNECTION_MONITOR_INCOMING_IDLE_MULTIPLIER; + double CONNECTION_MONITOR_UNREFERENCED_CLOSE_DELAY; //FlowTransport double CONNECTION_REJECTED_MESSAGE_DELAY; From 2f29b2c3d13d832edd1d8cdb31a9d3db5a30ec75 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Tue, 9 Jul 2019 14:01:05 -0700 Subject: [PATCH 098/136] simulator: Just do a wait() in setupAndRun to avoid destruction It get us out of the ACTOR, never clearing the systemActors, and let simulator call exit(). 
--- fdbserver/SimulatedCluster.actor.cpp | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index fe7c5a1ac2..44a80342ab 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1373,28 +1373,6 @@ void checkExtraDB(const char *testFile, int &extraDB, int &minimumReplication, i ifs.close(); } -// To be called after we stop simulator, so that destructors of each process is -// called with right context, with access to right globals. At this point, we -// also no longer have to protect coordinator addresses. -// TODO: Investigate why this doesn't work when we call before stop(). Some -// earlier permanently failed processes seems to be the reason. -ACTOR Future destroyAllProcesses() { - state ISimulator::ProcessInfo* simProcess = g_simulator.getCurrentProcess(); - state vector processes = g_simulator.getAllProcesses(); - state std::vector::iterator it; - - g_simulator.protectedAddresses.clear(); - for (it = processes.begin(); it != processes.end(); ++it) { - if (*it == simProcess || (*it)->failed) continue; - wait (g_simulator.onProcess(*it, TaskPriority::DefaultYield)); - (*it)->shutdownSignal.send(ISimulator::KillInstantly); - g_simulator.destroyProcess(*it); - } - - wait (g_simulator.onProcess(simProcess, TaskPriority::DefaultYield)); - return Void(); -} - ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting, bool restoring, std::string whitelistBinPaths, Reference tlsOptions) { state vector> systemActors; state Optional connFile; @@ -1451,6 +1429,6 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot TraceEvent("SimulatedSystemDestruct"); g_simulator.stop(); destructed = true; - wait(destroyAllProcesses()); - systemActors.clear(); + wait(Never()); + ASSERT(false); } From fdd580c8788b4d4b27a90908ee8c05c9a9bf59e6 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Tue, 9 Jul 2019 15:00:11 -0700 Subject: [PATCH 099/136] Restore some variable initializations that were unintentionally removed. --- fdbclient/NativeAPI.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index b9cea4c1ad..41be26ccfe 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -514,7 +514,7 @@ DatabaseContext::DatabaseContext( transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), - transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), + transactionsProcessBehind("ProcessBehind", cc), transactionWaitsForFullRecovery("WaitsForFullRecovery", cc), outstandingWatches(0), transactionTimeout(0.0), transactionMaxRetries(-1), latencies(1000), readLatencies(1000), commitLatencies(1000), GRVLatencies(1000), mutationsPerCommit(1000), bytesPerCommit(1000), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), internal(internal) { From 4b8eb27134ee242a97f6451f14b6dcaef6529896 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Tue, 9 Jul 2019 14:57:38 -0700 Subject: [PATCH 100/136] fdbrpc: Move setStatus line in addPeerReference --- fdbrpc/FlowTransport.actor.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index d0f69162aa..c0e78e9a32 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -1089,10 +1089,11 @@ 
Endpoint FlowTransport::loadedEndpoint( const UID& token ) { } void FlowTransport::addPeerReference(const Endpoint& endpoint, bool isStream) { - if (FlowTransport::transport().isClient()) { + if (!isStream || !endpoint.getPrimaryAddress().isValid()) + return; + else if (FlowTransport::transport().isClient()) IFailureMonitor::failureMonitor().setStatus(endpoint.getPrimaryAddress(), FailureStatus(false)); - } - if (!isStream || !endpoint.getPrimaryAddress().isValid()) return; + Peer* peer = self->getPeer(endpoint.getPrimaryAddress()); if(peer->peerReferences == -1) { peer->peerReferences = 1; From c8cf7f88ef5cd9b4b8c569c7c49cf09a818071b9 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 9 Jul 2019 15:25:32 -0700 Subject: [PATCH 101/136] Add a release note for option fix, in particular noting that a 6.2 client must be used as the primary for the behavior to work correctly. --- documentation/sphinx/source/release-notes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index f2a9813030..bb59d59559 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -14,6 +14,8 @@ Performance Fixes ----- +* During an upgrade, the multi-version client now persists database default options and transaction options that aren't reset on retry (e.g. transaction timeout). In order for these options to function correctly during an upgrade, a 6.2 or later client should be used as the primary client. `(PR #1767) `_. 
+ Status ------ From d032d7fcf93748bd41bfd3ac089a2712a99de261 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:37:54 -0700 Subject: [PATCH 102/136] fix: if we get a broken_promise from the actor, wait to get the real error from the store --- fdbserver/worker.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index f23d889c37..af948b9b87 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -129,6 +129,9 @@ ACTOR Future handleIOErrors( Future actor, IClosable* store, UID id, } else { wait(onClosed); } + if(e.isError() && e.getError().code() == error_code_broken_promise && !storeError.isReady()) { + wait(delay(0.00001 + FLOW_KNOBS->MAX_BUGGIFIED_DELAY)); + } if(storeError.isReady()) throw storeError.get().getError(); if (e.isError()) throw e.getError(); else return e.get(); } From b27a909f3af2aadc0b012a912683ee6dc84c2aa2 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:38:59 -0700 Subject: [PATCH 103/136] fix: onDisconnectOrFailure can spuriously trigger --- fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 797aa85a13..d4af96c99d 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -251,7 +251,7 @@ Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { ACTOR Future serverPeekOnFailed( ILogSystem::ServerPeekCursor* self ) { loop { choose { - when( wait( self->interf->get().present() ? IFailureMonitor::failureMonitor().onDisconnectOrFailure( self->interf->get().interf().peekMessages.getEndpoint() ) : Never() ) ) { return Void(); } + when( wait( self->interf->get().present() ? 
IFailureMonitor::failureMonitor().onStateEqual( self->interf->get().interf().peekMessages.getEndpoint(), FailureStatus() ) : Never() ) ) { return Void(); } when( wait( self->interf->onChange() ) ) {} } } From 64aee73c4f3544216029d98e363e16dce518a63f Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:47:56 -0700 Subject: [PATCH 104/136] we only need to hold the ReplyPromise for messages that we are going to forward to new proxies --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/MasterProxyServer.actor.cpp | 36 +++++++++++++++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 5f7f45ea19..2b4fb5e87f 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -283,6 +283,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( UPDATE_REMOTE_LOG_VERSION_INTERVAL, 2.0 ); init( MAX_TXS_POP_VERSION_HISTORY, 1e5 ); init( PROXY_FORWARD_DELAY, 10.0 ); + init( MAX_FORWARD_MESSAGES, 1e6 ); // Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. 
DataDistibution) diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index f8517ababe..f0f11d5277 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -228,6 +228,7 @@ public: double UPDATE_REMOTE_LOG_VERSION_INTERVAL; int MAX_TXS_POP_VERSION_HISTORY; double PROXY_FORWARD_DELAY; + int MAX_FORWARD_MESSAGES; // Master Server double COMMIT_SLEEP_TIME; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 83cf0d51db..81b86ec2c0 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -1789,23 +1789,34 @@ ACTOR Future checkRemoved(Reference> db, uint64_t r } } -ACTOR Future forwardProxy(ClientDBInfo info, RequestStream commit, RequestStream getConsistentReadVersion, RequestStream getKeyServersLocations) { +ACTOR template Future stripRequests( RequestStream in, PromiseStream> out, int* count) { + loop { + X req = waitNext(in.getFuture()); + out.send(req.reply); + if((*count) >= 0 && ++(*count) >= SERVER_KNOBS->MAX_FORWARD_MESSAGES) { + TraceEvent(SevWarnAlways, "TooManyProxyForwardRequests"); + return Void(); + } + } +} + +ACTOR Future forwardProxy(ClientDBInfo info, PromiseStream> commitReplies, PromiseStream> grvReplies, PromiseStream> locationReplies) { loop { choose { - when(CommitTransactionRequest req = waitNext(commit.getFuture())) { + when(ReplyPromise req = waitNext(commitReplies.getFuture())) { CommitID rep; rep.newClientInfo = info; - req.reply.send(rep); + req.send(rep); } - when(GetReadVersionRequest req = waitNext(getConsistentReadVersion.getFuture())) { + when(ReplyPromise req = waitNext(grvReplies.getFuture())) { GetReadVersionReply rep; rep.newClientInfo = info; - req.reply.send(rep); + req.send(rep); } - when(GetKeyServerLocationsRequest req = waitNext(getKeyServersLocations.getFuture())) { + when(ReplyPromise req = waitNext(locationReplies.getFuture())) { GetKeyServerLocationsReply rep; rep.newClientInfo = info; - req.reply.send(rep); + req.send(rep); } } 
wait(yield()); @@ -1833,15 +1844,20 @@ ACTOR Future masterProxyServer( } } core.cancel(); - state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY); + state PromiseStream> commitReplies; + state PromiseStream> grvReplies; + state PromiseStream> locationReplies; + state int replyCount = 0; + state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY) || stripRequest(proxy.commit, commitReplies, &replyCount) || stripRequest(proxy.getConsistentReadVersion, grvReplies, &replyCount) || stripRequest(proxy.getKeyServersLocations, locationReplies, &replyCount); + proxy = MasterProxyInterface(); loop { if(finishForward.isReady()) { return Void(); } if(db->get().client.proxies.size() > 0 && !db->get().client.proxies[0].provisional && db->get().recoveryCount >= req.recoveryCount && !std::count(db->get().client.proxies.begin(), db->get().client.proxies.end(), proxy)) { - core = forwardProxy(db->get().client, proxy.commit, proxy.getConsistentReadVersion, proxy.getKeyServersLocations); - proxy = MasterProxyInterface(); + replyCount = -1; + core = forwardProxy(db->get().client, commitReplies, grvReplies, locationReplies); wait(finishForward); return Void(); } From 001abec29d7fa9445dacb7121b1db296106ee5ae Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 16:50:59 -0700 Subject: [PATCH 105/136] fixed a compiler error, buggified a new knob --- fdbserver/Knobs.cpp | 2 +- fdbserver/MasterProxyServer.actor.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 2b4fb5e87f..5ea999b472 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -283,7 +283,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( UPDATE_REMOTE_LOG_VERSION_INTERVAL, 2.0 ); init( MAX_TXS_POP_VERSION_HISTORY, 1e5 ); init( PROXY_FORWARD_DELAY, 10.0 ); - init( MAX_FORWARD_MESSAGES, 1e6 ); + init( MAX_FORWARD_MESSAGES, 1e6 ); if( randomize && BUGGIFY ) MAX_FORWARD_MESSAGES = 10; // 
Master Server // masterCommitter() in the master server will allow lower priority tasks (e.g. DataDistibution) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 81b86ec2c0..07c3bbc931 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -1848,7 +1848,7 @@ ACTOR Future masterProxyServer( state PromiseStream> grvReplies; state PromiseStream> locationReplies; state int replyCount = 0; - state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY) || stripRequest(proxy.commit, commitReplies, &replyCount) || stripRequest(proxy.getConsistentReadVersion, grvReplies, &replyCount) || stripRequest(proxy.getKeyServersLocations, locationReplies, &replyCount); + state Future finishForward = delay(SERVER_KNOBS->PROXY_FORWARD_DELAY) || stripRequests(proxy.commit, commitReplies, &replyCount) || stripRequests(proxy.getConsistentReadVersion, grvReplies, &replyCount) || stripRequests(proxy.getKeyServersLocations, locationReplies, &replyCount); proxy = MasterProxyInterface(); loop { if(finishForward.isReady()) { From a53bf9289ae31393f8b37efb92474fefb9beddf7 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 9 Jul 2019 17:13:24 -0700 Subject: [PATCH 106/136] remove SnapTestAttrition because it is causing correctness errors --- .../from_6.2.0/SnapTestAttrition-1.txt | 45 ------------------- .../from_6.2.0/SnapTestAttrition-2.txt | 7 --- 2 files changed, 52 deletions(-) delete mode 100644 tests/restarting/from_6.2.0/SnapTestAttrition-1.txt delete mode 100644 tests/restarting/from_6.2.0/SnapTestAttrition-2.txt diff --git a/tests/restarting/from_6.2.0/SnapTestAttrition-1.txt b/tests/restarting/from_6.2.0/SnapTestAttrition-1.txt deleted file mode 100644 index d3ceed1584..0000000000 --- a/tests/restarting/from_6.2.0/SnapTestAttrition-1.txt +++ /dev/null @@ -1,45 +0,0 @@ -testTitle=SnapTestPre -;write 1000 Keys ending with even numbers - testName=SnapTest - numSnaps=1 - maxSnapDelay=3.0 
- testID=0 - clearAfterTest=false - -testTitle=SnapTestTakeSnap -;Take snap and do read/write - testName=ReadWrite - testDuration=10.0 - transactionsPerSecond=10000 - writesPerTransactionA=0 - readsPerTransactionA=10 - writesPerTransactionB=10 - readsPerTransactionB=1 - alpha=0.5 - nodeCount=100000 - valueBytes=16 - discardEdgeMeasurements=false - - testName=SnapTest - numSnaps=1 - maxSnapDelay=10.0 - testID=1 - clearAfterTest=false - - testName=Attrition - testDuration=10.0 - -testTitle=SnapTestPost -;write 1000 Keys ending with odd numbers - testName=SnapTest - numSnaps=1 - maxSnapDelay=25.0 - testID=2 - clearAfterTest=false - -; save and shutdown -testTitle=SnapSimpleShutdown - testName=SaveAndKill - restartInfoLocation=simfdb/restartInfo.ini - testDuration=10.0 - isRestoring=1 diff --git a/tests/restarting/from_6.2.0/SnapTestAttrition-2.txt b/tests/restarting/from_6.2.0/SnapTestAttrition-2.txt deleted file mode 100644 index 07d71073e1..0000000000 --- a/tests/restarting/from_6.2.0/SnapTestAttrition-2.txt +++ /dev/null @@ -1,7 +0,0 @@ -; verify all keys are even numbered -testTitle=SnapTestVerify -testName=SnapTest -numSnaps=1 -maxSnapDelay=3.0 -testID=3 -restartInfoLocation=simfdb/restartInfo.ini From 38ae352fc55e887d5f33e18b2f0e607e2457d951 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 10 Jul 2019 09:46:23 -0700 Subject: [PATCH 107/136] Fix a merge issue --- flow/SystemMonitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/SystemMonitor.cpp b/flow/SystemMonitor.cpp index 4481cba0f0..c391c93db1 100644 --- a/flow/SystemMonitor.cpp +++ b/flow/SystemMonitor.cpp @@ -145,7 +145,7 @@ SystemStatistics customSystemMonitor(std::string eventName, StatisticsState *sta } } - for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkMetrics.priorityBins[i] != 0; i++) { + for (int i = 0; i < NetworkMetrics::PRIORITY_BINS && g_network->networkMetrics.priorityBins[i] != TaskPriority::Zero; i++) { if(g_network->networkMetrics.priorityBlocked[i]) { double lastSegment = std::min(currentStats.elapsed, now() - g_network->networkMetrics.priorityTimer[i]); g_network->networkMetrics.priorityBlockedDuration[i] += lastSegment; From c694931e33ce4daaedf73c03947fc66763f5a801 Mon Sep 17 00:00:00 2001 From: Vishesh Yadav Date: Wed, 10 Jul 2019 14:06:06 -0700 Subject: [PATCH 108/136] sim2: Remove obsolete comment --- fdbrpc/sim2.actor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index 92402fd56a..e71959d831 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -380,7 +380,6 @@ private: ACTOR static Future trackLeakedConnection( Sim2Conn* self ) { wait( g_simulator.onProcess( self->process ) ); - // SOMEDAY: Make this value variable? Dependent on buggification status? if (self->process->address.isPublic()) { wait( delay( FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * FLOW_KNOBS->CONNECTION_MONITOR_IDLE_TIMEOUT * 1.5 ) ); } else { From b4dbc6d7fad9c932981ee3cb3c13fa6453435dda Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 10 Jul 2019 14:43:20 -0700 Subject: [PATCH 109/136] Change the way cache hits and misses are tracked to avoid counting blind page writes as misses and count the results of partial page writes. Report cache hit rate in status. 
--- .../source/mr-status-json-schemas.rst.inc | 4 ++ fdbclient/Schemas.cpp | 4 ++ fdbrpc/AsyncFileCached.actor.h | 62 +++++++++++------ fdbserver/Status.actor.cpp | 68 +++++++++++++++++++ flow/SystemMonitor.h | 4 +- 5 files changed, 119 insertions(+), 23 deletions(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index ed07092ab1..1ba9d1652d 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -297,6 +297,10 @@ } ] }, + "page_cache":{ + "log_hit_rate":0.5, + "storage_hit_rate":0.5 + }, "messages":[ { "reasons":[ diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index ec4488741e..9eebf116dc 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -319,6 +319,10 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( } ] }, + "page_cache":{ + "log_hit_rate":0.5, + "storage_hit_rate":0.5 + }, "messages":[ { "reasons":[ diff --git a/fdbrpc/AsyncFileCached.actor.h b/fdbrpc/AsyncFileCached.actor.h index 031ba79ab9..d9b192b662 100644 --- a/fdbrpc/AsyncFileCached.actor.h +++ b/fdbrpc/AsyncFileCached.actor.h @@ -67,8 +67,6 @@ struct EvictablePageCache : ReferenceCounted { EvictablePageCache() : pageSize(0), maxPages(0), cacheEvictionType(RANDOM) {} explicit EvictablePageCache(int pageSize, int64_t maxSize) : pageSize(pageSize), maxPages(maxSize / pageSize), cacheEvictionType(evictionPolicyStringToEnum(FLOW_KNOBS->CACHE_EVICTION_POLICY)) { - cacheHits.init(LiteralStringRef("EvictablePageCache.CacheHits")); - cacheMisses.init(LiteralStringRef("EvictablePageCache.CacheMisses")); cacheEvictions.init(LiteralStringRef("EvictablePageCache.CacheEvictions")); } @@ -82,7 +80,6 @@ struct EvictablePageCache : ReferenceCounted { } else { lruPages.push_back(*page); // new page is considered the most recently used (placed at LRU tail) } - ++cacheMisses; } void 
updateHit(EvictablePage* page) { @@ -91,7 +88,6 @@ struct EvictablePageCache : ReferenceCounted { lruPages.erase(List::s_iterator_to(*page)); lruPages.push_back(*page); } - ++cacheHits; } void try_evict() { @@ -126,8 +122,6 @@ struct EvictablePageCache : ReferenceCounted { List lruPages; int pageSize; int64_t maxPages; - Int64MetricHandle cacheHits; - Int64MetricHandle cacheMisses; Int64MetricHandle cacheEvictions; const CacheEvictionType cacheEvictionType; }; @@ -278,6 +272,8 @@ private: Int64MetricHandle countFileCacheWrites; Int64MetricHandle countFileCacheReadsBlocked; Int64MetricHandle countFileCacheWritesBlocked; + Int64MetricHandle countFileCachePageReadsHit; + Int64MetricHandle countFileCachePageReadsMissed; Int64MetricHandle countFileCachePageReadsMerged; Int64MetricHandle countFileCacheReadBytes; @@ -286,28 +282,33 @@ private: Int64MetricHandle countCacheWrites; Int64MetricHandle countCacheReadsBlocked; Int64MetricHandle countCacheWritesBlocked; + Int64MetricHandle countCachePageReadsHit; + Int64MetricHandle countCachePageReadsMissed; Int64MetricHandle countCachePageReadsMerged; Int64MetricHandle countCacheReadBytes; - AsyncFileCached( Reference uncached, const std::string& filename, int64_t length, Reference pageCache ) + AsyncFileCached( Reference uncached, const std::string& filename, int64_t length, Reference pageCache ) : uncached(uncached), filename(filename), length(length), prevLength(length), pageCache(pageCache), currentTruncate(Void()), currentTruncateSize(0) { if( !g_network->isSimulated() ) { - countFileCacheWrites.init( LiteralStringRef("AsyncFile.CountFileCacheWrites"), filename); - countFileCacheReads.init( LiteralStringRef("AsyncFile.CountFileCacheReads"), filename); - countFileCacheWritesBlocked.init( LiteralStringRef("AsyncFile.CountFileCacheWritesBlocked"), filename); - countFileCacheReadsBlocked.init( LiteralStringRef("AsyncFile.CountFileCacheReadsBlocked"), filename); + 
countFileCacheWrites.init(LiteralStringRef("AsyncFile.CountFileCacheWrites"), filename); + countFileCacheReads.init(LiteralStringRef("AsyncFile.CountFileCacheReads"), filename); + countFileCacheWritesBlocked.init(LiteralStringRef("AsyncFile.CountFileCacheWritesBlocked"), filename); + countFileCacheReadsBlocked.init(LiteralStringRef("AsyncFile.CountFileCacheReadsBlocked"), filename); + countFileCachePageReadsHit.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsHit"), filename); + countFileCachePageReadsMissed.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsMissed"), filename); countFileCachePageReadsMerged.init(LiteralStringRef("AsyncFile.CountFileCachePageReadsMerged"), filename); - countFileCacheFinds.init( LiteralStringRef("AsyncFile.CountFileCacheFinds"), filename); - countFileCacheReadBytes.init( LiteralStringRef("AsyncFile.CountFileCacheReadBytes"), filename); + countFileCacheFinds.init(LiteralStringRef("AsyncFile.CountFileCacheFinds"), filename); + countFileCacheReadBytes.init(LiteralStringRef("AsyncFile.CountFileCacheReadBytes"), filename); - countCacheWrites.init( LiteralStringRef("AsyncFile.CountCacheWrites")); - countCacheReads.init( LiteralStringRef("AsyncFile.CountCacheReads")); - countCacheWritesBlocked.init( LiteralStringRef("AsyncFile.CountCacheWritesBlocked")); - countCacheReadsBlocked.init( LiteralStringRef("AsyncFile.CountCacheReadsBlocked")); + countCacheWrites.init(LiteralStringRef("AsyncFile.CountCacheWrites")); + countCacheReads.init(LiteralStringRef("AsyncFile.CountCacheReads")); + countCacheWritesBlocked.init(LiteralStringRef("AsyncFile.CountCacheWritesBlocked")); + countCacheReadsBlocked.init(LiteralStringRef("AsyncFile.CountCacheReadsBlocked")); + countCachePageReadsHit.init(LiteralStringRef("AsyncFile.CountCachePageReadsHit")); + countCachePageReadsMissed.init(LiteralStringRef("AsyncFile.CountCachePageReadsMissed")); countCachePageReadsMerged.init(LiteralStringRef("AsyncFile.CountCachePageReadsMerged")); - 
countCacheFinds.init( LiteralStringRef("AsyncFile.CountCacheFinds")); - countCacheReadBytes.init( LiteralStringRef("AsyncFile.CountCacheReadBytes")); - + countCacheFinds.init(LiteralStringRef("AsyncFile.CountCacheFinds")); + countCacheReadBytes.init(LiteralStringRef("AsyncFile.CountCacheReadBytes")); } } @@ -387,11 +388,18 @@ struct AFCPage : public EvictablePage, public FastAllocated { // If there are no active readers then if data is valid or we're replacing all of it we can write directly if (valid || fullPage) { + if(!fullPage) { + ++owner->countFileCachePageReadsHit; + ++owner->countCachePageReadsHit; + } valid = true; memcpy( static_cast(this->data) + offset, data, length ); return yield(); } + ++owner->countFileCachePageReadsMissed; + ++owner->countCachePageReadsMissed; + // If data is not valid but no read is in progress, start reading if (notReading.isReady()) { notReading = readThrough( this ); @@ -410,7 +418,14 @@ struct AFCPage : public EvictablePage, public FastAllocated { Future readZeroCopy() { ++zeroCopyRefCount; - if (valid) return yield(); + if (valid) { + ++owner->countFileCachePageReadsHit; + ++owner->countCachePageReadsHit; + return yield(); + } + + ++owner->countFileCachePageReadsMissed; + ++owner->countCachePageReadsMissed; if (notReading.isReady()) { notReading = readThrough( this ); @@ -428,12 +443,17 @@ struct AFCPage : public EvictablePage, public FastAllocated { Future read( void* data, int length, int offset ) { if (valid) { + ++owner->countFileCachePageReadsHit; + ++owner->countCachePageReadsHit; owner->countFileCacheReadBytes += length; owner->countCacheReadBytes += length; memcpy( data, static_cast(this->data) + offset, length ); return yield(); } + ++owner->countFileCachePageReadsMissed; + ++owner->countCachePageReadsMissed; + if (notReading.isReady()) { notReading = readThrough( this ); } else { diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index ceb929f894..01a74ab763 100644 --- a/fdbserver/Status.actor.cpp 
+++ b/fdbserver/Status.actor.cpp @@ -1578,6 +1578,68 @@ ACTOR static Future workloadStatusFetcher(Reference clusterSummaryStatisticsFetcher(WorkerEvents pMetrics, Future>>> storageServerFuture, + Future>>> tlogFuture, std::set *incomplete_reasons) +{ + state JsonBuilderObject statusObj; + try { + state JsonBuilderObject cacheStatistics; + + ErrorOr>> storageServers = wait(storageServerFuture); + + if (!storageServers.present()) { + throw storageServers.getError(); + } + + double storageCacheHitsHz = 0; + double storageCacheMissesHz = 0; + + for(auto &ss : storageServers.get()) { + auto processMetrics = pMetrics.find(ss.first.address()); + if(processMetrics != pMetrics.end()) { + int64_t hits = processMetrics->second.getInt64("CacheHits"); + int64_t misses = processMetrics->second.getInt64("CacheMisses"); + double elapsed = processMetrics->second.getDouble("Elapsed"); + storageCacheHitsHz += hits / elapsed; + storageCacheMissesHz += misses / elapsed; + } + } + + cacheStatistics["storage_hit_rate"] = (storageCacheMissesHz == 0) ? 1.0 : storageCacheHitsHz / (storageCacheHitsHz + storageCacheMissesHz); + + ErrorOr>> tlogServers = wait(tlogFuture); + + if(!tlogServers.present()) { + throw tlogServers.getError(); + } + + double logCacheHitsHz = 0; + double logCacheMissesHz = 0; + + for(auto &log : tlogServers.get()) { + auto processMetrics = pMetrics.find(log.first.address()); + if(processMetrics != pMetrics.end()) { + int64_t hits = processMetrics->second.getInt64("CacheHits"); + int64_t misses = processMetrics->second.getInt64("CacheMisses"); + double elapsed = processMetrics->second.getDouble("Elapsed"); + logCacheHitsHz += hits / elapsed; + logCacheMissesHz += misses / elapsed; + } + } + + cacheStatistics["log_hit_rate"] = (logCacheMissesHz == 0) ? 
1.0 : logCacheHitsHz / (logCacheHitsHz + logCacheMissesHz); + statusObj["page_cache"] = cacheStatistics; + } + catch (Error& e) { + if (e.code() == error_code_actor_cancelled) + throw; + + incomplete_reasons->insert("Unknown cache statistics."); + } + + return statusObj; +} + static JsonBuilderArray oldTlogFetcher(int* oldLogFaultTolerance, Reference> db, std::unordered_map const& address_workers) { JsonBuilderArray oldTlogsArray; @@ -2025,6 +2087,7 @@ ACTOR Future clusterGetStatus( futures2.push_back(workloadStatusFetcher(db, workers, mWorker, rkWorker, &qos, &data_overlay, &status_incomplete_reasons, storageServerFuture)); futures2.push_back(layerStatusFetcher(cx, &messages, &status_incomplete_reasons)); futures2.push_back(lockedStatusFetcher(db, &messages, &status_incomplete_reasons)); + futures2.push_back(clusterSummaryStatisticsFetcher(pMetrics, storageServerFuture, tLogFuture, &status_incomplete_reasons)); state std::vector workerStatuses = wait(getAll(futures2)); @@ -2069,6 +2132,11 @@ ACTOR Future clusterGetStatus( statusObj.addContents(workerStatuses[3]); } + // Insert cluster summary statistics + if(!workerStatuses[4].empty()) { + statusObj.addContents(workerStatuses[4]); + } + // Need storage servers now for processStatusFetcher() below. 
ErrorOr>> _storageServers = wait(storageServerFuture); if (_storageServers.present()) { diff --git a/flow/SystemMonitor.h b/flow/SystemMonitor.h index 4c0585cd69..afc3584c36 100644 --- a/flow/SystemMonitor.h +++ b/flow/SystemMonitor.h @@ -124,8 +124,8 @@ struct NetworkData { countFileCachePageReadsMerged = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsMerged")); countFileCacheFinds = getValue(LiteralStringRef("AsyncFile.CountCacheFinds")); countFileCacheReadBytes = getValue(LiteralStringRef("AsyncFile.CountCacheReadBytes")); - countFilePageCacheHits = getValue(LiteralStringRef("EvictablePageCache.CacheHits")); - countFilePageCacheMisses = getValue(LiteralStringRef("EvictablePageCache.CacheMisses")); + countFilePageCacheHits = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsHit")); + countFilePageCacheMisses = getValue(LiteralStringRef("AsyncFile.CountCachePageReadsMissed")); countFilePageCacheEvictions = getValue(LiteralStringRef("EvictablePageCache.CacheEvictions")); } }; From a380dda5e8e006c62f9a811cc1771eebc58671f6 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 10 Jul 2019 18:41:12 -0700 Subject: [PATCH 110/136] fixed a typo --- fdbserver/TagPartitionedLogSystem.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2dd94d7ac5..5fe2b28d74 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -2109,7 +2109,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags) { for(int i = 0; i < oldLogSystem->logRouterTags; i++) { Tag tag = Tag(tagLocalityLogRouter, i); - // Sattelite logs will index a mutation with tagLocalityLogRouter with an id greater than + // Satellite logs will index a mutation with tagLocalityLogRouter with an id greater than // the number of log routers as having an id mod the number of log routers. 
We thus need // to make sure that if we're going from more log routers in the previous generation to // less log routers in the newer one, that we map the log router tags onto satellites that From bbef631872ee6f7dc7b1babcf4d0a09512146f30 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 10 Jul 2019 18:48:54 -0700 Subject: [PATCH 111/136] fix: do not access optionInfo unless the option already exists in the map --- fdbclient/MultiVersionTransaction.actor.cpp | 11 ++++++++--- fdbclient/NativeAPI.actor.cpp | 8 +++++++- fdbclient/ReadYourWrites.actor.cpp | 8 +++++++- fdbclient/ThreadSafeTransaction.actor.cpp | 6 ++++++ fdbserver/workloads/FuzzApiCorrectness.actor.cpp | 1 + 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index 16bf8afd0a..eb973b2659 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -596,9 +596,14 @@ Version MultiVersionTransaction::getCommittedVersion() { } void MultiVersionTransaction::setOption(FDBTransactionOptions::Option option, Optional value) { - if(MultiVersionApi::apiVersionAtLeast(610) && FDBTransactionOptions::optionInfo[option].persistent) { + auto itr = FDBTransactionOptions::optionInfo.find(option); + if(itr == FDBTransactionOptions::optionInfo.end()) { + TraceEvent("UnknownTransactionOption").detail("Option", option); + throw invalid_option(); + } + + if(MultiVersionApi::apiVersionAtLeast(610) && itr->second.persistent) { persistentOptions.emplace_back(option, value.castTo>()); - } auto tr = getTransaction(); if(tr.transaction) { @@ -683,7 +688,7 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional throw invalid_option(); } - int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + int defaultFor = itr->second.defaultFor; if (defaultFor >= 0) { 
ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 1d44b5a3cc..eec7ce36cd 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -756,7 +756,13 @@ uint64_t extractHexOption( StringRef value ) { } void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional value) { - int defaultFor = FDBDatabaseOptions::optionInfo[option].defaultFor; + auto itr = FDBDatabaseOptions::optionInfo.find(option); + if(itr == FDBDatabaseOptions::optionInfo.end()) { + TraceEvent("UnknownDatabaseOption").detail("Option", option); + throw invalid_option(); + } + + int defaultFor = itr->second.defaultFor; if (defaultFor >= 0) { ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 013fd61596..709fd5a734 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1759,9 +1759,15 @@ Future> ReadYourWritesTransaction::getVersionstamp() { } void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { + auto itr = FDBTransactionOptions::optionInfo.find(option); + if(itr == FDBTransactionOptions::optionInfo.end()) { + TraceEvent("UnknownTransactionOption").detail("Option", option); + throw invalid_option(); + } + setOptionImpl(option, value); - if(FDBTransactionOptions::optionInfo[option].persistent) { + if(itr->second.persistent) { persistentOptions.emplace_back(option, value.castTo>()); } diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 0a47c43407..c26170c8c8 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -283,6 +283,12 @@ ThreadFuture> 
ThreadSafeTransaction::getVersionstamp() { } void ThreadSafeTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { + auto itr = FDBTransactionOptions::optionInfo.find(option); + if(itr == FDBTransactionOptions::optionInfo.end()) { + TraceEvent("UnknownTransactionOption").detail("Option", option); + throw invalid_option(); + } + ReadYourWritesTransaction *tr = this->tr; Standalone> passValue = value; diff --git a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp index 0c02bbb556..80fe24c6ed 100644 --- a/fdbserver/workloads/FuzzApiCorrectness.actor.cpp +++ b/fdbserver/workloads/FuzzApiCorrectness.actor.cpp @@ -1092,6 +1092,7 @@ struct FuzzApiCorrectnessWorkload : TestWorkload { } contract = { + std::make_pair( error_code_invalid_option, ExceptionContract::Possible ), std::make_pair( error_code_invalid_option_value, ExceptionContract::Possible ), std::make_pair( error_code_client_invalid_operation, ExceptionContract::possibleIf((FDBTransactionOptions::Option)op == FDBTransactionOptions::READ_YOUR_WRITES_DISABLE || (FDBTransactionOptions::Option)op == FDBTransactionOptions::LOG_TRANSACTION) ), From f4366e69caa3da8983d296df41425a902f181c1a Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 11:25:39 -0700 Subject: [PATCH 112/136] Unknown options should not be used internally (i.e. underneath thread-safe API). This commit removes various checks that options exist and replaces them with an ASSERT. --- fdbcli/fdbcli.actor.cpp | 6 +++--- fdbclient/FDBOptions.h | 18 +++++++++++++----- fdbclient/NativeAPI.actor.cpp | 8 +------- fdbclient/ReadYourWrites.actor.cpp | 9 +-------- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index a84712ddfd..67611efab1 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -181,7 +181,7 @@ public: private: //Sets a transaction option. 
If intrans == true, then this option is also applied to the passed in transaction. void setTransactionOption(Reference tr, FDBTransactionOptions::Option option, bool enabled, Optional arg, bool intrans) { - if(enabled && arg.present() != FDBTransactionOptions::optionInfo[option].hasParameter) { + if(enabled && arg.present() != FDBTransactionOptions::optionInfo.getMustExist(option).hasParameter) { printf("ERROR: option %s a parameter\n", arg.present() ? "did not expect" : "expected"); throw invalid_option_value(); } @@ -237,7 +237,7 @@ private: //Returns true if the specified option is documented bool isDocumented(typename T::Option option) { - FDBOptionInfo info = T::optionInfo[option]; + FDBOptionInfo info = T::optionInfo.getMustExist(option); std::string deprecatedStr = "Deprecated"; return !info.comment.empty() && info.comment.substr(0, deprecatedStr.size()) != deprecatedStr; @@ -259,7 +259,7 @@ private: void printHelpString() { for(auto itr = legalOptions.begin(); itr != legalOptions.end(); ++itr) { if(isDocumented(itr->second)) { - FDBOptionInfo info = T::optionInfo[itr->second]; + FDBOptionInfo info = T::optionInfo.getMustExist(itr->second); std::string helpStr = info.name + " - " + info.comment; if(info.hasParameter) helpStr += " " + info.parameterComment; diff --git a/fdbclient/FDBOptions.h b/fdbclient/FDBOptions.h index 677a54ee6a..80e00903ba 100644 --- a/fdbclient/FDBOptions.h +++ b/fdbclient/FDBOptions.h @@ -54,11 +54,19 @@ private: std::map optionInfo; public: - typename std::map::iterator begin() { return optionInfo.begin(); } - typename std::map::iterator end() { return optionInfo.end(); } - typename std::map::iterator find(const typename T::Option& key) { return optionInfo.find(key); } + typename std::map::const_iterator begin() const { return optionInfo.begin(); } + typename std::map::const_iterator end() const { return optionInfo.end(); } + typename std::map::const_iterator find(const typename T::Option& key) const { return optionInfo.find(key); } 
- FDBOptionInfo& operator[] (const typename T::Option& key) { return optionInfo[key]; } + void insert(const typename T::Option& key, FDBOptionInfo info) { + optionInfo[key] = info; + } + + FDBOptionInfo const& getMustExist(const typename T::Option& key) const { + auto itr = optionInfo.find(key); + ASSERT(itr != optionInfo.end()); + return itr->second; + } FDBOptionInfoMap() { T::init(); } }; @@ -88,6 +96,6 @@ public: typename OptionList::const_iterator end() const { return options.cend(); } }; -#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor ) type::optionInfo[var] = FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor); +#define ADD_OPTION_INFO( type, var, name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor ) type::optionInfo.insert(var, FDBOptionInfo(name, comment, parameterComment, hasParameter, hidden, persistent, defaultFor)); #endif \ No newline at end of file diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index eec7ce36cd..c00fffafa9 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -756,13 +756,7 @@ uint64_t extractHexOption( StringRef value ) { } void DatabaseContext::setOption( FDBDatabaseOptions::Option option, Optional value) { - auto itr = FDBDatabaseOptions::optionInfo.find(option); - if(itr == FDBDatabaseOptions::optionInfo.end()) { - TraceEvent("UnknownDatabaseOption").detail("Option", option); - throw invalid_option(); - } - - int defaultFor = itr->second.defaultFor; + int defaultFor = FDBDatabaseOptions::optionInfo.getMustExist(option).defaultFor; if (defaultFor >= 0) { ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 709fd5a734..2f862cb338 100644 --- a/fdbclient/ReadYourWrites.actor.cpp 
+++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1759,17 +1759,10 @@ Future> ReadYourWritesTransaction::getVersionstamp() { } void ReadYourWritesTransaction::setOption( FDBTransactionOptions::Option option, Optional value ) { - auto itr = FDBTransactionOptions::optionInfo.find(option); - if(itr == FDBTransactionOptions::optionInfo.end()) { - TraceEvent("UnknownTransactionOption").detail("Option", option); - throw invalid_option(); - } - setOptionImpl(option, value); - if(itr->second.persistent) { + if (FDBTransactionOptions::optionInfo.getMustExist(option).persistent) { persistentOptions.emplace_back(option, value.castTo>()); - } } From 8e3c8ee0c40478a978d339bcd9017a1e0a576b1c Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 12:07:45 -0700 Subject: [PATCH 113/136] Fix a tab/space mismatch that I was responsible for. --- documentation/sphinx/source/mr-status-json-schemas.rst.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index ed07092ab1..90c5763e36 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -188,7 +188,7 @@ "hz":0.0 } }, - "run_loop_busy":0.2 // fraction of time the run loop was busy + "run_loop_busy":0.2 // fraction of time the run loop was busy } }, "old_logs":[ From 97609ad9917a8cb934b0a70f0b1c4ba76c93e653 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 13:54:44 -0700 Subject: [PATCH 114/136] Add information about transaction starts at different priorities to status. 
--- .../sphinx/source/mr-status-json-schemas.rst.inc | 15 +++++++++++++++ fdbclient/Schemas.cpp | 15 +++++++++++++++ fdbserver/Status.actor.cpp | 15 ++++++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 90c5763e36..fc8f33dafb 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -411,6 +411,21 @@ "counter":0, "roughness":0.0 }, + "started_immediate_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_default_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_batch_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "conflicted":{ "hz":0.0, "counter":0, diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index ec4488741e..c78fbbbb26 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -434,6 +434,21 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "counter":0, "roughness":0.0 }, + "started_immediate_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_default_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, + "started_batch_priority":{ + "hz":0.0, + "counter":0, + "roughness":0.0 + }, "conflicted":{ "hz":0.0, "counter":0, diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index ceb929f894..40d210271b 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1462,13 +1462,23 @@ ACTOR static Future workloadStatusFetcher(Reference proxyStats = wait(getAll(proxyStatFutures)); - StatusCounter mutations, mutationBytes, txnConflicts, txnStartOut, txnCommitOutSuccess; + StatusCounter mutations; + StatusCounter mutationBytes; + StatusCounter txnConflicts; + StatusCounter txnStartOut; + StatusCounter txnSystemPriorityStartOut; + StatusCounter txnDefaultPriorityStartOut; + 
StatusCounter txnBatchPriorityStartOut; + StatusCounter txnCommitOutSuccess; for (auto &ps : proxyStats) { mutations.updateValues( StatusCounter(ps.getValue("Mutations")) ); mutationBytes.updateValues( StatusCounter(ps.getValue("MutationBytes")) ); txnConflicts.updateValues( StatusCounter(ps.getValue("TxnConflicts")) ); txnStartOut.updateValues( StatusCounter(ps.getValue("TxnStartOut")) ); + txnSystemPriorityStartOut.updateValues(StatusCounter(ps.getValue("TxnSystemPriorityStartOut"))); + txnDefaultPriorityStartOut.updateValues(StatusCounter(ps.getValue("TxnDefaultPriorityStartOut"))); + txnBatchPriorityStartOut.updateValues(StatusCounter(ps.getValue("TxnBatchPriorityStartOut"))); txnCommitOutSuccess.updateValues( StatusCounter(ps.getValue("TxnCommitOutSuccess")) ); } @@ -1478,6 +1488,9 @@ ACTOR static Future workloadStatusFetcher(Reference Date: Thu, 11 Jul 2019 13:59:54 -0700 Subject: [PATCH 115/136] Add release note. --- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 8be4172363..e82ee58f14 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -21,6 +21,7 @@ Status ------ * Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #1760) `_. +* Added transaction start counts by priority to ``cluster.workload.transactions``. The new counters are named ``started_immediate_priority``, ``started_default_priority``, and ``started_batch_priority``. `(PR #1836) `_. * Remove ``cluster.datacenter_version_difference`` and replace it with ``cluster.datacenter_lag`` that has subfields ``versions`` and ``seconds``. `(PR #1800) `_. Bindings From 46d670b261cfaff777f70ec1bfe2542286906769 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 11 Jul 2019 14:02:34 -0700 Subject: [PATCH 116/136] Add release note. 
--- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 9123615929..04ec8d171c 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -20,6 +20,7 @@ Status ------ * Added ``run_loop_busy`` to the ``processes`` section to record the fraction of time the run loop is busy. `(PR #1760) `_. +* Added ``cluster.page_cache`` section to status. In this section, added two new statistics ``storage_hit_rate`` and ``log_hit_rate`` that indicate the fraction of recent page reads that were served by cache. `(PR #1823) `_. * Remove ``cluster.datacenter_version_difference`` and replace it with ``cluster.datacenter_lag`` that has subfields ``versions`` and ``seconds``. `(PR #1800) `_. Bindings From 9c3591ff43fe6f14365b8828f2c968db461c2142 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Wed, 10 Jul 2019 21:55:24 -0700 Subject: [PATCH 117/136] Fix python3 test failure Both key and value has to be of type bytes. 
--- bindings/python/tests/size_limit_tests.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bindings/python/tests/size_limit_tests.py b/bindings/python/tests/size_limit_tests.py index 3072e153f8..6445e02e2d 100644 --- a/bindings/python/tests/size_limit_tests.py +++ b/bindings/python/tests/size_limit_tests.py @@ -36,14 +36,14 @@ def setValueWithLimit(tr, key, value, limit): def test_size_limit_option(db): db.options.set_transaction_timeout(2000) # 2 seconds db.options.set_transaction_retry_limit(3) - value = 'a' * 1024 + value = b'a' * 1024 - setValue(db, 't1', value) - assert(value == db['t1']) + setValue(db, b't1', value) + assert(value == db[b't1']) try: db.options.set_transaction_size_limit(1000) - setValue(db, 't2', value) + setValue(db, b't2', value) assert(False) # not reached except fdb.FDBError as e: assert(e.code == 2101) # Transaction exceeds byte limit (2101) @@ -51,7 +51,7 @@ def test_size_limit_option(db): # Per transaction option overrides database option db.options.set_transaction_size_limit(1000000) try: - setValueWithLimit(db, 't3', value, 1000) + setValueWithLimit(db, b't3', value, 1000) assert(False) # not reached except fdb.FDBError as e: assert(e.code == 2101) # Transaction exceeds byte limit (2101) @@ -60,9 +60,9 @@ def test_size_limit_option(db): db.options.set_transaction_size_limit(1000) tr = db.create_transaction() try: - tr['t4'] = 'bar' + tr[b't4'] = b'bar' tr.on_error(fdb.FDBError(1007)).wait() - setValue(tr, 't4', value) + setValue(tr, b't4', value) tr.commit().wait() assert(False) # not reached except fdb.FDBError as e: From 1cf036fc9f5cdaea6e7d2de8b162737c08ad5b8c Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Fri, 12 Jul 2019 10:43:42 -0700 Subject: [PATCH 118/136] Remove SnapTestAttrition from ctest SnapTestAttrition the test file was removed in a53bf928 --- tests/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 
f0913f2a26..3964790878 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -156,9 +156,6 @@ add_fdb_test( add_fdb_test( TEST_FILES restarting/from_6.2.0/SnapCycleRestart-1.txt restarting/from_6.2.0/SnapCycleRestart-2.txt) -add_fdb_test( - TEST_FILES restarting/from_6.2.0/SnapTestAttrition-1.txt - restarting/from_6.2.0/SnapTestAttrition-2.txt) add_fdb_test( TEST_FILES restarting/from_5.1.7/DrUpgradeRestart-1.txt restarting/from_5.1.7/DrUpgradeRestart-2.txt IGNORE) From d5051b08dd1178963197a84585f4e4c386c4589b Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 12 Jul 2019 16:12:35 -0700 Subject: [PATCH 119/136] Make trace event field lengths (and total event sizes) default knobified and configurable. Add a transaction option to control the field length of transaction debug logging. Make the program start command line field less likely to be truncated. --- bindings/flow/tester/Tester.actor.cpp | 7 +- bindings/go/src/_stacktester/stacktester.go | 6 +- bindings/go/src/fdb/generated.go | 34 ++++-- .../foundationdb/test/AsyncStackTester.java | 6 +- .../apple/foundationdb/test/StackTester.java | 6 +- bindings/python/tests/tester.py | 9 +- bindings/ruby/tests/tester.rb | 6 +- documentation/sphinx/source/api-c.rst | 2 + .../sphinx/source/api-common.rst.inc | 10 +- documentation/sphinx/source/api-python.rst | 10 ++ documentation/sphinx/source/api-ruby.rst | 10 ++ documentation/sphinx/source/data-modeling.rst | 2 + .../sphinx/source/developer-guide.rst | 2 + documentation/sphinx/source/release-notes.rst | 1 + fdbbackup/backup.actor.cpp | 3 + fdbcli/fdbcli.actor.cpp | 2 + fdbclient/ClientLogEvents.h | 102 ++++++++++++++---- fdbclient/NativeAPI.actor.cpp | 15 +++ fdbclient/NativeAPI.actor.h | 11 +- fdbclient/vexillographer/fdb.options | 17 +-- fdbserver/fdbserver.actor.cpp | 3 + flow/Knobs.cpp | 2 + flow/Knobs.h | 2 + flow/Trace.cpp | 51 +++++++-- flow/Trace.h | 14 +++ 25 files changed, 268 insertions(+), 65 deletions(-) diff --git 
a/bindings/flow/tester/Tester.actor.cpp b/bindings/flow/tester/Tester.actor.cpp index fa309a59e8..1d6702be05 100644 --- a/bindings/flow/tester/Tester.actor.cpp +++ b/bindings/flow/tester/Tester.actor.cpp @@ -1551,19 +1551,21 @@ struct UnitTestsFunc : InstructionFunc { const uint64_t noRetryLimit = -1; const uint64_t maxRetryDelay = 100; const uint64_t sizeLimit = 100000; + const uint64_t maxFieldLength = 1000; data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_LOCATION_CACHE_SIZE, Optional(StringRef((const uint8_t*)&locationCacheSize, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MAX_WATCHES, Optional(StringRef((const uint8_t*)&maxWatches, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_DATACENTER_ID, Optional(LiteralStringRef("dc_id"))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_MACHINE_ID, Optional(LiteralStringRef("machine_id"))); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE); + data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, Optional(StringRef((const uint8_t*)&maxFieldLength, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_TIMEOUT, Optional(StringRef((const uint8_t*)&timeout, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_TIMEOUT, Optional(StringRef((const uint8_t*)&noTimeout, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_MAX_RETRY_DELAY, Optional(StringRef((const uint8_t*)&maxRetryDelay, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_SIZE_LIMIT, Optional(StringRef((const uint8_t*)&sizeLimit, 8))); data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_RETRY_LIMIT, Optional(StringRef((const uint8_t*)&retryLimit, 8))); 
data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_TRANSACTION_RETRY_LIMIT, Optional(StringRef((const uint8_t*)&noRetryLimit, 8))); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_ENABLE); - data->db->setDatabaseOption(FDBDatabaseOption::FDB_DB_OPTION_SNAPSHOT_RYW_DISABLE); state Reference tr = data->db->createTransaction(); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_PRIORITY_SYSTEM_IMMEDIATE); @@ -1574,6 +1576,7 @@ struct UnitTestsFunc : InstructionFunc { tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TRANSACTION_LOGGING_MAX_FIELD_LENGTH, Optional(StringRef((const uint8_t*)&maxFieldLength, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_TIMEOUT, Optional(StringRef((const uint8_t*)&timeout, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_RETRY_LIMIT, Optional(StringRef((const uint8_t*)&retryLimit, 8))); tr->setOption(FDBTransactionOption::FDB_TR_OPTION_MAX_RETRY_DELAY, Optional(StringRef((const uint8_t*)&maxRetryDelay, 8))); diff --git a/bindings/go/src/_stacktester/stacktester.go b/bindings/go/src/_stacktester/stacktester.go index 5b5d988e6b..aef3f5ceda 100644 --- a/bindings/go/src/_stacktester/stacktester.go +++ b/bindings/go/src/_stacktester/stacktester.go @@ -793,13 +793,14 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { db.Options().SetMaxWatches(10001) db.Options().SetDatacenterId("dc_id") db.Options().SetMachineId("machine_id") + db.Options().SetSnapshotRywEnable() + db.Options().SetSnapshotRywDisable() + db.Options().SetTransactionLoggingMaxFieldLength(1000) db.Options().SetTransactionTimeout(100000) db.Options().SetTransactionTimeout(0) db.Options().SetTransactionMaxRetryDelay(100) db.Options().SetTransactionRetryLimit(10) 
db.Options().SetTransactionRetryLimit(-1) - db.Options().SetSnapshotRywEnable() - db.Options().SetSnapshotRywDisable() if !fdb.IsAPIVersionSelected() { log.Fatal("API version should be selected") @@ -836,6 +837,7 @@ func (sm *StackMachine) processInst(idx int, inst tuple.Tuple) { tr.Options().SetReadYourWritesDisable() tr.Options().SetReadSystemKeys() tr.Options().SetAccessSystemKeys() + tr.Options().SetTransactionLoggingMaxFieldLength(1000) tr.Options().SetTimeout(60 * 1000) tr.Options().SetRetryLimit(50) tr.Options().SetMaxRetryDelay(100) diff --git a/bindings/go/src/fdb/generated.go b/bindings/go/src/fdb/generated.go index 782b108fda..3435613de6 100644 --- a/bindings/go/src/fdb/generated.go +++ b/bindings/go/src/fdb/generated.go @@ -280,6 +280,23 @@ func (o DatabaseOptions) SetDatacenterId(param string) error { return o.setOpt(22, []byte(param)) } +// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior. +func (o DatabaseOptions) SetSnapshotRywEnable() error { + return o.setOpt(26, nil) +} + +// Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300. +func (o DatabaseOptions) SetSnapshotRywDisable() error { + return o.setOpt(27, nil) +} + +// Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option. This sets the ``transaction_logging_max_field_length`` option of each transaction created by this database. See the transaction option description for more information. +// +// Parameter: Maximum length of escaped key and value fields. +func (o DatabaseOptions) SetTransactionLoggingMaxFieldLength(param int64) error { + return o.setOpt(405, int64ToBytes(param)) +} + // Set a timeout in milliseconds which, when elapsed, will cause each transaction automatically to be cancelled. This sets the ``timeout`` option of each transaction created by this database. 
See the transaction option description for more information. Using this option requires that the API version is 610 or higher. // // Parameter: value in milliseconds of timeout @@ -308,16 +325,6 @@ func (o DatabaseOptions) SetTransactionSizeLimit(param int64) error { return o.setOpt(503, int64ToBytes(param)) } -// Snapshot read operations will see the results of writes done in the same transaction. This is the default behavior. -func (o DatabaseOptions) SetSnapshotRywEnable() error { - return o.setOpt(26, nil) -} - -// Snapshot read operations will not see the results of writes done in the same transaction. This was the default behavior prior to API version 300. -func (o DatabaseOptions) SetSnapshotRywDisable() error { - return o.setOpt(27, nil) -} - // The transaction, if not self-conflicting, may be committed a second time after commit succeeds, in the event of a fault func (o TransactionOptions) SetCausalWriteRisky() error { return o.setOpt(10, nil) @@ -412,6 +419,13 @@ func (o TransactionOptions) SetLogTransaction() error { return o.setOpt(404, nil) } +// Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation. +// +// Parameter: Maximum length of escaped key and value fields. +func (o TransactionOptions) SetTransactionLoggingMaxFieldLength(param int64) error { + return o.setOpt(405, int64ToBytes(param)) +} + // Set a timeout in milliseconds which, when elapsed, will cause the transaction automatically to be cancelled. Valid parameter values are ``[0, INT_MAX]``. If set to 0, will disable all timeouts. All pending and any future uses of the transaction will throw an exception. The transaction can be used again after it is reset. Prior to API version 610, like all other transaction options, the timeout must be reset after a call to ``onError``. 
If the API version is 610 or greater, the timeout is not reset after an ``onError`` call. This allows the user to specify a longer timeout on specific transactions than the default timeout specified through the ``transaction_timeout`` database option without the shorter database timeout cancelling transactions that encounter a retryable error. Note that at all API versions, it is safe and legal to set the timeout each time the transaction begins, so most code written assuming the older behavior can be upgraded to the newer behavior without requiring any modification, and the caller is not required to implement special logic in retry loops to only conditionally set this option. // // Parameter: value in milliseconds of timeout diff --git a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java index 6e2b6e9318..7920d9d9dc 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/AsyncStackTester.java @@ -481,13 +481,14 @@ public class AsyncStackTester { db.options().setMaxWatches(10001); db.options().setDatacenterId("dc_id"); db.options().setMachineId("machine_id"); + db.options().setSnapshotRywEnable(); + db.options().setSnapshotRywDisable(); + db.options().setTransactionLoggingMaxFieldLength(1000); db.options().setTransactionTimeout(100000); db.options().setTransactionTimeout(0); db.options().setTransactionMaxRetryDelay(100); db.options().setTransactionRetryLimit(10); db.options().setTransactionRetryLimit(-1); - db.options().setSnapshotRywEnable(); - db.options().setSnapshotRywDisable(); tr.options().setPrioritySystemImmediate(); tr.options().setPriorityBatch(); @@ -496,6 +497,7 @@ public class AsyncStackTester { tr.options().setReadYourWritesDisable(); tr.options().setReadSystemKeys(); tr.options().setAccessSystemKeys(); + tr.options().setTransactionLoggingMaxFieldLength(1000); 
tr.options().setTimeout(60*1000); tr.options().setRetryLimit(50); tr.options().setMaxRetryDelay(100); diff --git a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java index 20c9770bc2..a9cf47320f 100644 --- a/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java +++ b/bindings/java/src/test/com/apple/foundationdb/test/StackTester.java @@ -434,13 +434,14 @@ public class StackTester { db.options().setMaxWatches(10001); db.options().setDatacenterId("dc_id"); db.options().setMachineId("machine_id"); + db.options().setSnapshotRywEnable(); + db.options().setSnapshotRywDisable(); + db.options().setTransactionLoggingMaxFieldLength(1000); db.options().setTransactionTimeout(100000); db.options().setTransactionTimeout(0); db.options().setTransactionMaxRetryDelay(100); db.options().setTransactionRetryLimit(10); db.options().setTransactionRetryLimit(-1); - db.options().setSnapshotRywEnable(); - db.options().setSnapshotRywDisable(); tr.options().setPrioritySystemImmediate(); tr.options().setPriorityBatch(); @@ -449,6 +450,7 @@ public class StackTester { tr.options().setReadYourWritesDisable(); tr.options().setReadSystemKeys(); tr.options().setAccessSystemKeys(); + tr.options().setTransactionLoggingMaxFieldLength(1000); tr.options().setTimeout(60*1000); tr.options().setRetryLimit(50); tr.options().setMaxRetryDelay(100); diff --git a/bindings/python/tests/tester.py b/bindings/python/tests/tester.py index 95aa36ea3e..b5dc84dbd3 100644 --- a/bindings/python/tests/tester.py +++ b/bindings/python/tests/tester.py @@ -128,9 +128,13 @@ class Instruction: def test_db_options(db): + db.options.set_location_cache_size(100001) db.options.set_max_watches(100001) db.options.set_datacenter_id("dc_id") db.options.set_machine_id("machine_id") + db.options.set_snapshot_ryw_enable() + db.options.set_snapshot_ryw_disable() + db.options.set_transaction_logging_max_field_length(1000) 
db.options.set_transaction_timeout(100000) db.options.set_transaction_timeout(0) db.options.set_transaction_timeout(0) @@ -138,8 +142,6 @@ def test_db_options(db): db.options.set_transaction_size_limit(100000) db.options.set_transaction_retry_limit(10) db.options.set_transaction_retry_limit(-1) - db.options.set_snapshot_ryw_enable() - db.options.set_snapshot_ryw_disable() @fdb.transactional @@ -151,6 +153,7 @@ def test_options(tr): tr.options.set_read_your_writes_disable() tr.options.set_read_system_keys() tr.options.set_access_system_keys() + tr.options.set_transaction_logging_max_field_length(1000) tr.options.set_timeout(60 * 1000) tr.options.set_retry_limit(50) tr.options.set_max_retry_delay(100) @@ -545,8 +548,6 @@ class Tester: inst.push(b"WAITED_FOR_EMPTY") elif inst.op == six.u("UNIT_TESTS"): try: - db.options.set_location_cache_size(100001) - test_db_options(db) test_options(db) test_watches(db) diff --git a/bindings/ruby/tests/tester.rb b/bindings/ruby/tests/tester.rb index 829ecf8a5f..c199eddc09 100755 --- a/bindings/ruby/tests/tester.rb +++ b/bindings/ruby/tests/tester.rb @@ -456,14 +456,15 @@ class Tester @db.options.set_max_watches(10001) @db.options.set_datacenter_id("dc_id") @db.options.set_machine_id("machine_id") + @db.options.set_snapshot_ryw_enable() + @db.options.set_snapshot_ryw_disable() + @db.options.set_transaction_logging_max_field_length(1000) @db.options.set_transaction_timeout(100000) @db.options.set_transaction_timeout(0) @db.options.set_transaction_max_retry_delay(100) @db.options.set_transaction_size_limit(100000) @db.options.set_transaction_retry_limit(10) @db.options.set_transaction_retry_limit(-1) - @db.options.set_snapshot_ryw_enable() - @db.options.set_snapshot_ryw_disable() @db.transact do |tr| tr.options.set_priority_system_immediate @@ -473,6 +474,7 @@ class Tester tr.options.set_read_your_writes_disable tr.options.set_read_system_keys tr.options.set_access_system_keys + 
tr.options.set_transaction_logging_max_field_length(1000) tr.options.set_timeout(60*1000) tr.options.set_retry_limit(50) tr.options.set_max_retry_delay(100) diff --git a/documentation/sphinx/source/api-c.rst b/documentation/sphinx/source/api-c.rst index 1e5d0243dc..a57b9a052b 100644 --- a/documentation/sphinx/source/api-c.rst +++ b/documentation/sphinx/source/api-c.rst @@ -49,6 +49,8 @@ .. |max-retry-delay-database-option| replace:: FIXME .. |transaction-size-limit-database-option| replace:: FIXME .. |timeout-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME .. include:: api-common.rst.inc diff --git a/documentation/sphinx/source/api-common.rst.inc b/documentation/sphinx/source/api-common.rst.inc index 3c99c45382..3862fea779 100644 --- a/documentation/sphinx/source/api-common.rst.inc +++ b/documentation/sphinx/source/api-common.rst.inc @@ -326,6 +326,10 @@ If this option has been set more times with this database than the disable option, snapshot reads will *not* see the effects of prior writes in the same transaction. Disabling this option is equivalent to calling |snapshot-ryw-disable-transaction-option| on each transaction created by this database. +.. |option-db-tr-transaction-logging-max-field-length-blurb| replace:: + + Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option. This is equivalent to calling |transaction-logging-max-field-length-transaction-option| on each transaction created by this database. + .. |transaction-options-blurb| replace:: Transaction options alter the behavior of FoundationDB transactions. FoundationDB defaults to extremely safe transaction behavior, and we have worked hard to make the performance excellent with the default setting, so you should not often need to use transaction options. @@ -411,7 +415,7 @@ .. 
|option-set-timeout-blurb3| replace:: - Prior to API version 610, like all other transaction options, a timeout must be reset after a call to |on-error-func|. Note that resetting this option resets only the timeout *duration*, not the starting point from which the time is measured. If the API version is 610 or newer, then the timeout is not reset. This allows the user to specify a timeout for specific transactions that is longer than the timeout specified by |timeout-database-option|. Note that at all API versions, it is safe and legal to call this option after each call to |on-error-func|, so most code written assuming the older behavior can be upgraded without requiring any modification. This also means that there is no need to introduce logic to conditionally set this option within retry loops. One can set the default timeout for all transactions by calling |timeout-database-option|. + Prior to API version 610, like all other transaction options, a timeout must be reset after a call to |on-error-func|. Note that resetting this option resets only the timeout *duration*, not the starting point from which the time is measured. If the API version is 610 or newer, then the timeout is not reset. This allows the user to specify a timeout for specific transactions that is longer than the timeout specified by |timeout-database-option|. Note that at all API versions, it is safe and legal to call this option after each call to |on-error-func|, so most code written assuming the older behavior can be upgraded without requiring any modification. This also means that there is no need to introduce logic to conditionally set this option within retry loops. One can set the default timeout for all transactions by calling |timeout-database-option|. .. |option-next-write-no-write-conflict-range-blurb| replace:: @@ -421,6 +425,10 @@ Care needs to be taken when using this option on a transaction that is shared between multiple threads. 
When setting this option, write conflict ranges will be disabled on the next write operation, regardless of what thread it is on. +.. |option-set-transaction-logging-max-field-length-blurb| replace:: + + Sets the maximum escaped length of key and value fields to be logged to the trace file via the LOG_TRANSACTION option, after which the field will be truncated. A negative value disables truncation. One can set the default max field length for all transactions by calling |transaction-logging-max-field-length-database-option|. + .. |future-blurb1| replace:: Many FoundationDB API functions return "future" objects. A brief overview of futures is included in the :doc:`class scheduling tutorial `. Most future objects behave just like a normal object, but block when you use them for the first time if the asynchronous function which returned the future has not yet completed its action. A future object is considered ready when either a value is available, or when an error has occurred. diff --git a/documentation/sphinx/source/api-python.rst b/documentation/sphinx/source/api-python.rst index 68ae70b1fa..a063c9f234 100644 --- a/documentation/sphinx/source/api-python.rst +++ b/documentation/sphinx/source/api-python.rst @@ -25,6 +25,7 @@ .. |timeout-database-option| replace:: :func:`Database.options.set_transaction_timeout` .. |max-retry-delay-database-option| replace:: :func:`Database.options.set_transaction_max_retry_delay` .. |transaction-size-limit-database-option| replace:: :func:`Database.options.set_transaction_size_limit` +.. |transaction-logging-max-field-length-database-option| replace:: :func:`Database.options.set_transaction_logging_max_field_length` .. |snapshot-ryw-enable-database-option| replace:: :func:`Database.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-database-option| replace:: :func:`Database.options.set_snapshot_ryw_disable` .. |future-type-string| replace:: a :ref:`future ` @@ -35,6 +36,7 @@ .. 
|size-limit-transaction-option| replace:: :func:`Transaction.options.set_size_limit` .. |snapshot-ryw-enable-transaction-option| replace:: :func:`Transaction.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-transaction-option| replace:: :func:`Transaction.options.set_snapshot_ryw_disable` +.. |transaction-logging-max-field-length-transaction-option| replace:: :func:`Transaction.options.set_transaction_logging_max_field_length` .. |lazy-iterator-object| replace:: generator .. |key-meth| replace:: :meth:`Subspace.key` .. |directory-subspace| replace:: :ref:`DirectorySubspace ` @@ -384,6 +386,10 @@ Database options |option-db-tr-size-limit-blurb| +.. method:: Database.options.set_transaction_logging_max_field_length(size_limit) + + |option-db-tr-transaction-logging-max-field-length-blurb| + .. method:: Database.options.set_snapshot_ryw_enable() |option-db-snapshot-ryw-enable-blurb| @@ -855,6 +861,10 @@ Transaction options |option-set-timeout-blurb3| +.. method:: Transaction.options.set_transaction_logging_max_field_length(size_limit) + + |option-set-transaction-logging-max-field-length-blurb| + .. _api-python-future: Future objects diff --git a/documentation/sphinx/source/api-ruby.rst b/documentation/sphinx/source/api-ruby.rst index cc35ad68b8..d363feecd1 100644 --- a/documentation/sphinx/source/api-ruby.rst +++ b/documentation/sphinx/source/api-ruby.rst @@ -25,6 +25,7 @@ .. |transaction-size-limit-database-option| replace:: :func:`Database.options.set_transaction_size_limit` .. |snapshot-ryw-enable-database-option| replace:: :meth:`Database.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-database-option| replace:: :meth:`Database.options.set_snapshot_ryw_disable` +.. |transaction-logging-max-field-length-database-option| replace:: :meth:`Database.options.set_transaction_logging_max_field_length` .. |future-type-string| replace:: a :class:`Future` .. 
|read-your-writes-disable-option| replace:: :meth:`Transaction.options.set_read_your_writes_disable` .. |retry-limit-transaction-option| replace:: :meth:`Transaction.options.set_retry_limit` @@ -33,6 +34,7 @@ .. |size-limit-transaction-option| replace:: :meth:`Transaction.options.set_size_limit` .. |snapshot-ryw-enable-transaction-option| replace:: :meth:`Transaction.options.set_snapshot_ryw_enable` .. |snapshot-ryw-disable-transaction-option| replace:: :meth:`Transaction.options.set_snapshot_ryw_disable` +.. |transaction-logging-max-field-length-transaction-option| replace:: :meth:`Transaction.options.set_transaction_logging_max_field_length` .. |lazy-iterator-object| replace:: :class:`Enumerator` .. |key-meth| replace:: :meth:`Subspace.key` .. |directory-subspace| replace:: :class:`DirectorySubspace` @@ -380,6 +382,10 @@ Database options |option-db-tr-size-limit-blurb| +.. method:: Database.options.set_transaction_logging_max_field_length(size_limit) -> nil + + |option-db-tr-transaction-logging-max-field-length-blurb| + .. method:: Database.options.set_snapshot_ryw_enable() -> nil |option-db-snapshot-ryw-enable-blurb| @@ -797,6 +803,10 @@ Transaction options |option-set-timeout-blurb3| +.. method:: Transaction.options.set_transaction_logging_max_field_length(size_limit) -> nil + + |option-set-transaction-logging-max-field-length-blurb| + .. _transact: The transact method diff --git a/documentation/sphinx/source/data-modeling.rst b/documentation/sphinx/source/data-modeling.rst index 51867006af..8fcde06958 100644 --- a/documentation/sphinx/source/data-modeling.rst +++ b/documentation/sphinx/source/data-modeling.rst @@ -51,6 +51,8 @@ .. |max-retry-delay-database-option| replace:: FIXME .. |transaction-size-limit-database-option| replace:: FIXME .. |timeout-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME +.. |transaction-logging-max-field-length-database-option| replace:: FIXME ..
include:: api-common.rst.inc diff --git a/documentation/sphinx/source/developer-guide.rst b/documentation/sphinx/source/developer-guide.rst index e8d6335b6f..0d7f24f661 100644 --- a/documentation/sphinx/source/developer-guide.rst +++ b/documentation/sphinx/source/developer-guide.rst @@ -51,6 +51,8 @@ .. |max-retry-delay-database-option| replace:: FIXME .. |transaction-size-limit-database-option| replace:: FIXME .. |timeout-database-option| replace:: FIXME +.. |transaction-logging-max-field-length-transaction-option| replace:: FIXME +.. |transaction-logging-max-field-length-database-option| replace:: FIXME .. include:: api-common.rst.inc diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 9817574b41..e53eeab2ca 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -30,6 +30,7 @@ Bindings * Go: The Go bindings now require Go version 1.11 or later. * Go: Fix issue with finalizers running too early that could lead to undefined behavior. `(PR #1451) `_. +* Added transaction option to control the field length of keys and values in debug transaction logging in order to avoid truncation. `(PR #) `_. Other Changes ------------- diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 5d62be9a8b..a3bf12c493 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -3194,11 +3194,14 @@ int main(int argc, char* argv[]) { } TraceEvent("ProgramStart") + .setMaxEventLength(12000) .detail("SourceVersion", getHGVersion()) .detail("Version", FDB_VT_VERSION ) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 
0 : time(NULL)) + .setMaxFieldLength(10000) .detail("CommandLine", commandLine) + .setMaxFieldLength(0) .detail("MemoryLimit", memLimit) .trackLatest("ProgramStart"); diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 67611efab1..9bbd0977a4 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -2529,12 +2529,14 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (opt.trace) { TraceEvent("CLIProgramStart") + .setMaxEventLength(12000) .detail("SourceVersion", getHGVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL)) .detail("ClusterFile", ccf->getFilename().c_str()) .detail("ConnectionString", ccf->getConnectionString().toString()) + .setMaxFieldLength(10000) .detail("CommandLine", opt.commandLine) .trackLatest("ProgramStart"); } diff --git a/fdbclient/ClientLogEvents.h b/fdbclient/ClientLogEvents.h index 14191bf209..9be2b78798 100644 --- a/fdbclient/ClientLogEvents.h +++ b/fdbclient/ClientLogEvents.h @@ -44,7 +44,7 @@ namespace FdbClientLogEvents { EventType type{ EVENTTYPEEND }; double startTs{ 0 }; - void logEvent(std::string id) const {} + void logEvent(std::string id, int maxFieldLength) const {} }; struct EventGetVersion : public Event { @@ -60,8 +60,10 @@ namespace FdbClientLogEvents { double latency; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetVersion").detail("TransactionID", id).detail("Latency", latency); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetVersion") + .detail("TransactionID", id) + .detail("Latency", latency); } }; @@ -80,8 +82,14 @@ namespace FdbClientLogEvents { int valueSize; Key key; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_Get").detail("TransactionID", id).detail("Latency", latency).detail("ValueSizeBytes", valueSize).detail("Key", key); + void logEvent(std::string id, int 
maxFieldLength) const override { + TraceEvent("TransactionTrace_Get") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("Latency", latency) + .detail("ValueSizeBytes", valueSize) + .setMaxFieldLength(maxFieldLength) + .detail("Key", key); } }; @@ -101,8 +109,15 @@ namespace FdbClientLogEvents { Key startKey; Key endKey; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetRange").detail("TransactionID", id).detail("Latency", latency).detail("RangeSizeBytes", rangeSize).detail("StartKey", startKey).detail("EndKey", endKey); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("Latency", latency) + .detail("RangeSizeBytes", rangeSize) + .setMaxFieldLength(maxFieldLength) + .detail("StartKey", startKey) + .detail("EndKey", endKey); } }; @@ -122,20 +137,38 @@ namespace FdbClientLogEvents { int commitBytes; CommitTransactionRequest req; // Only CommitTransactionRef and Arena object within CommitTransactionRequest is serialized - void logEvent(std::string id) const { + void logEvent(std::string id, int maxFieldLength) const override { for (auto &read_range : req.transaction.read_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_ReadConflictRange").detail("TransactionID", id).detail("Begin", read_range.begin).detail("End", read_range.end); + TraceEvent("TransactionTrace_Commit_ReadConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Begin", read_range.begin) + .detail("End", read_range.end); } for (auto &write_range : req.transaction.write_conflict_ranges) { - TraceEvent("TransactionTrace_Commit_WriteConflictRange").detail("TransactionID", id).detail("Begin", write_range.begin).detail("End", write_range.end); + TraceEvent("TransactionTrace_Commit_WriteConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + 
.setMaxFieldLength(maxFieldLength) + .detail("Begin", write_range.begin) + .detail("End", write_range.end); } for (auto &mutation : req.transaction.mutations) { - TraceEvent("TransactionTrace_Commit_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString()); + TraceEvent("TransactionTrace_Commit_Mutation") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Mutation", mutation.toString()); } - TraceEvent("TransactionTrace_Commit").detail("TransactionID", id).detail("Latency", latency).detail("NumMutations", numMutations).detail("CommitSizeBytes", commitBytes); + TraceEvent("TransactionTrace_Commit") + .detail("TransactionID", id) + .detail("Latency", latency) + .detail("NumMutations", numMutations) + .detail("CommitSizeBytes", commitBytes); } }; @@ -153,8 +186,13 @@ namespace FdbClientLogEvents { int errCode; Key key; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetError").detail("TransactionID", id).detail("ErrCode", errCode).detail("Key", key); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetError") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("ErrCode", errCode) + .setMaxFieldLength(maxFieldLength) + .detail("Key", key); } }; @@ -173,8 +211,14 @@ namespace FdbClientLogEvents { Key startKey; Key endKey; - void logEvent(std::string id) const { - TraceEvent("TransactionTrace_GetRangeError").detail("TransactionID", id).detail("ErrCode", errCode).detail("StartKey", startKey).detail("EndKey", endKey); + void logEvent(std::string id, int maxFieldLength) const override { + TraceEvent("TransactionTrace_GetRangeError") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .detail("ErrCode", errCode) + .setMaxFieldLength(maxFieldLength) + .detail("StartKey", startKey) + .detail("EndKey", endKey); } }; @@ -192,20 +236,36 @@ namespace FdbClientLogEvents { int errCode; CommitTransactionRequest req; // 
Only CommitTransactionRef and Arena object within CommitTransactionRequest is serialized - void logEvent(std::string id) const { + void logEvent(std::string id, int maxFieldLength) const override { for (auto &read_range : req.transaction.read_conflict_ranges) { - TraceEvent("TransactionTrace_CommitError_ReadConflictRange").detail("TransactionID", id).detail("Begin", read_range.begin).detail("End", read_range.end); + TraceEvent("TransactionTrace_CommitError_ReadConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Begin", read_range.begin) + .detail("End", read_range.end); } for (auto &write_range : req.transaction.write_conflict_ranges) { - TraceEvent("TransactionTrace_CommitError_WriteConflictRange").detail("TransactionID", id).detail("Begin", write_range.begin).detail("End", write_range.end); + TraceEvent("TransactionTrace_CommitError_WriteConflictRange") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Begin", write_range.begin) + .detail("End", write_range.end); } for (auto &mutation : req.transaction.mutations) { - TraceEvent("TransactionTrace_CommitError_Mutation").detail("TransactionID", id).detail("Mutation", mutation.toString()); + TraceEvent("TransactionTrace_CommitError_Mutation") + .setMaxEventLength(-1) + .detail("TransactionID", id) + .setMaxFieldLength(maxFieldLength) + .detail("Mutation", mutation.toString()); } - TraceEvent("TransactionTrace_CommitError").detail("TransactionID", id).detail("ErrCode", errCode); + TraceEvent("TransactionTrace_CommitError") + .detail("TransactionID", id) + .detail("ErrCode", errCode); } }; } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index c00fffafa9..ea9923a462 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -2994,6 +2994,7 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional(new 
TransactionLogInfo(value.get().printable(), TransactionLogInfo::DONT_LOG)); + trLogInfo->maxFieldLength = options.maxTransactionLoggingFieldLength; } break; @@ -3008,6 +3009,20 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional::max()); + if(maxFieldLength == 0) { + throw invalid_option_value(); + } + options.maxTransactionLoggingFieldLength = maxFieldLength; + } + if(trLogInfo) { + trLogInfo->maxFieldLength = options.maxTransactionLoggingFieldLength; + } + break; + case FDBTransactionOptions::MAX_RETRY_DELAY: validateOptionValue(value, true); options.maxBackoff = extractIntOption(value, 0, std::numeric_limits::max()) / 1000.0; diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 218f1f09d9..1338ffafca 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -149,6 +149,7 @@ struct TransactionOptions { double maxBackoff; uint32_t getReadVersionFlags; uint32_t sizeLimit; + int maxTransactionLoggingFieldLength; bool checkWritesEnabled : 1; bool causalWriteRisky : 1; bool commitOnFirstProxy : 1; @@ -174,17 +175,18 @@ struct TransactionInfo { struct TransactionLogInfo : public ReferenceCounted, NonCopyable { enum LoggingLocation { DONT_LOG = 0, TRACE_LOG = 1, DATABASE = 2 }; - TransactionLogInfo() : logLocation(DONT_LOG) {} - TransactionLogInfo(LoggingLocation location) : logLocation(location) {} - TransactionLogInfo(std::string id, LoggingLocation location) : logLocation(location), identifier(id) {} + TransactionLogInfo() : logLocation(DONT_LOG), maxFieldLength(0) {} + TransactionLogInfo(LoggingLocation location) : logLocation(location), maxFieldLength(0) {} + TransactionLogInfo(std::string id, LoggingLocation location) : logLocation(location), identifier(id), maxFieldLength(0) {} void setIdentifier(std::string id) { identifier = id; } void logTo(LoggingLocation loc) { logLocation = logLocation | loc; } + template void addLog(const T& event) { if(logLocation & TRACE_LOG) { 
ASSERT(!identifier.empty()) - event.logEvent(identifier); + event.logEvent(identifier, maxFieldLength); } if (flushed) { @@ -202,6 +204,7 @@ struct TransactionLogInfo : public ReferenceCounted, NonCopy bool logsAdded{ false }; bool flushed{ false }; int logLocation; + int maxFieldLength; std::string identifier; }; diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options index 775e443137..6044403dc6 100644 --- a/fdbclient/vexillographer/fdb.options +++ b/fdbclient/vexillographer/fdb.options @@ -146,6 +146,15 @@ description is not currently required but encouraged.