diff --git a/bindings/bindingtester/bindingtester.py b/bindings/bindingtester/bindingtester.py index 5a60d1112a..559244233b 100755 --- a/bindings/bindingtester/bindingtester.py +++ b/bindings/bindingtester/bindingtester.py @@ -68,6 +68,10 @@ class ResultSet(object): self.tester_results[name] = results + @staticmethod + def _min_tuple(t1, t2): + return t1 if fdb.tuple.compare(t1, t2) < 0 else t2 + def check_for_errors(self): if len(self.tester_results) == 1: return (0, False) @@ -97,7 +101,7 @@ class ResultSet(object): # If these results aren't using sequence numbers, then we match two results based on whether they share the same key else: - min_key = min([r.key(self.specification) for r in results.values()]) + min_key = reduce(ResultSet._min_tuple, [r.key(self.specification) for r in results.values()]) results = {i: r for i, r in results.items() if Result.tuples_match(r.key(self.specification), min_key)} # Increment the indices for those testers which produced a result in this iteration diff --git a/bindings/flow/fdb_flow.actor.cpp b/bindings/flow/fdb_flow.actor.cpp index 96512a0ce4..99af1a665e 100644 --- a/bindings/flow/fdb_flow.actor.cpp +++ b/bindings/flow/fdb_flow.actor.cpp @@ -85,7 +85,7 @@ void fdb_flow_test() { openTraceFile(NetworkAddress(), 1000000, 1000000, "."); systemMonitor(); - uncancellable(recurring(&systemMonitor, 5.0, TaskFlushTrace)); + uncancellable(recurring(&systemMonitor, 5.0, TaskPriority::FlushTrace)); Future t = _test(); @@ -179,7 +179,7 @@ namespace FDB { } void backToFutureCallback( FDBFuture* f, void* data ) { - g_network->onMainThread( Promise((SAV*)data), TaskDefaultOnMainThread ); // SOMEDAY: think about this priority + g_network->onMainThread( Promise((SAV*)data), TaskPriority::DefaultOnMainThread ); // SOMEDAY: think about this priority } // backToFuture( FDBFuture*, (FDBFuture* -> Type) ) -> Future diff --git a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index cb6669bfd0..5f08800996 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.1.10.pkg `_ +* `FoundationDB-6.1.11.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.1.10-1_amd64.deb `_ -* `foundationdb-server-6.1.10-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.1.11-1_amd64.deb `_ +* `foundationdb-server-6.1.11-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.1.10-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.1.10-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.1.11-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.1.11-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. -* `foundationdb-clients-6.1.10-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.1.10-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.1.11-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.1.11-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. 
-* `foundationdb-6.1.10-x64.msi `_ +* `foundationdb-6.1.11-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package: -* `foundationdb-6.1.10.tar.gz `_ +* `foundationdb-6.1.11.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.1.10.gem `_ +* `fdb-6.1.11.gem `_ Java 8+ ------- -* `fdb-java-6.1.10.jar `_ -* `fdb-java-6.1.10-javadoc.jar `_ +* `fdb-java-6.1.11.jar `_ +* `fdb-java-6.1.11-javadoc.jar `_ Go 1.11+ -------- diff --git a/documentation/sphinx/source/old-release-notes/release-notes-610.rst b/documentation/sphinx/source/old-release-notes/release-notes-610.rst index 2e07e729ae..2a43886f82 100644 --- a/documentation/sphinx/source/old-release-notes/release-notes-610.rst +++ b/documentation/sphinx/source/old-release-notes/release-notes-610.rst @@ -2,6 +2,14 @@ Release Notes ############# +6.1.11 +====== + +Fixes +----- + +* Machines which were added to a cluster immediately after the cluster was upgraded to 6.1 would not be given data. `(PR #1764) `_ + 6.1.10 ====== @@ -174,4 +182,4 @@ Earlier release notes * :doc:`Beta 2 (API Version 22) ` * :doc:`Beta 1 (API Version 21) ` * :doc:`Alpha 6 (API Version 16) ` -* :doc:`Alpha 5 (API Version 14) ` +* :doc:`Alpha 5 (API Version 14) ` \ No newline at end of file diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index f2a9813030..c4d248ba48 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -14,6 +14,8 @@ Performance Fixes ----- +* If a cluster is upgraded during an ``onError`` call, the cluster could return a ``cluster_version_changed`` error. `(PR #1734) `_. 
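The ``onError`` fix noted above corresponds to the ``MultiVersionTransaction::onError`` change later in this patch: when ``cluster_version_changed`` is raised while the multi-version client switches client libraries during an upgrade, ``onError`` now retries internally instead of surfacing the error to the caller. A minimal sketch of the standard retry loop that this protects, in flow-style C++; the function, key, and value names are illustrative and not taken from this patch, and the loop shape is the same whether the underlying transaction is the native or the multi-version client:

    #include "fdbclient/NativeAPI.actor.h"
    #include "flow/actorcompiler.h"  // must be the last include

    // Typical FoundationDB retry loop: onError() decides whether the error is
    // retryable, backs off, and resets the transaction. Before this fix, the
    // multi-version client could surface cluster_version_changed from onError()
    // itself during an upgrade, escaping loops like this one.
    ACTOR Future<Void> setKeyWithRetry(Database db, Key key, Value value) {
        state Transaction tr(db);
        loop {
            try {
                tr.set(key, value);
                wait(tr.commit());
                return Void();
            } catch (Error& e) {
                wait(tr.onError(e));  // retries transparently after the fix
            }
        }
    }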
+ Status ------ diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 00327def46..ebb63a82b9 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -59,31 +59,44 @@ extern const char* getHGVersion(); std::vector validOptions; -enum { OPT_CONNFILE, OPT_DATABASE, OPT_HELP, OPT_TRACE, OPT_TRACE_DIR, OPT_TIMEOUT, OPT_EXEC, OPT_NO_STATUS, OPT_STATUS_FROM_JSON, OPT_VERSION, OPT_TRACE_FORMAT }; +enum { + OPT_CONNFILE, + OPT_DATABASE, + OPT_HELP, + OPT_TRACE, + OPT_TRACE_DIR, + OPT_TIMEOUT, + OPT_EXEC, + OPT_NO_STATUS, + OPT_STATUS_FROM_JSON, + OPT_VERSION, + OPT_TRACE_FORMAT, + OPT_USE_OBJECT_SERIALIZER +}; -CSimpleOpt::SOption g_rgOptions[] = { - { OPT_CONNFILE, "-C", SO_REQ_SEP }, - { OPT_CONNFILE, "--cluster_file", SO_REQ_SEP }, - { OPT_DATABASE, "-d", SO_REQ_SEP }, - { OPT_TRACE, "--log", SO_NONE }, - { OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP }, - { OPT_TIMEOUT, "--timeout", SO_REQ_SEP }, - { OPT_EXEC, "--exec", SO_REQ_SEP }, - { OPT_NO_STATUS, "--no-status", SO_NONE }, - { OPT_HELP, "-?", SO_NONE }, - { OPT_HELP, "-h", SO_NONE }, - { OPT_HELP, "--help", SO_NONE }, - { OPT_STATUS_FROM_JSON, "--status-from-json", SO_REQ_SEP }, - { OPT_VERSION, "--version", SO_NONE }, - { OPT_VERSION, "-v", SO_NONE }, - { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, +CSimpleOpt::SOption g_rgOptions[] = { { OPT_CONNFILE, "-C", SO_REQ_SEP }, + { OPT_CONNFILE, "--cluster_file", SO_REQ_SEP }, + { OPT_DATABASE, "-d", SO_REQ_SEP }, + { OPT_TRACE, "--log", SO_NONE }, + { OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP }, + { OPT_TIMEOUT, "--timeout", SO_REQ_SEP }, + { OPT_EXEC, "--exec", SO_REQ_SEP }, + { OPT_NO_STATUS, "--no-status", SO_NONE }, + { OPT_HELP, "-?", SO_NONE }, + { OPT_HELP, "-h", SO_NONE }, + { OPT_HELP, "--help", SO_NONE }, + { OPT_STATUS_FROM_JSON, "--status-from-json", SO_REQ_SEP }, + { OPT_VERSION, "--version", SO_NONE }, + { OPT_VERSION, "-v", SO_NONE }, + { OPT_TRACE_FORMAT, "--trace_format", SO_REQ_SEP }, + { OPT_USE_OBJECT_SERIALIZER, "-S", SO_REQ_SEP }, + { OPT_USE_OBJECT_SERIALIZER, "--object-serializer", SO_REQ_SEP }, #ifndef TLS_DISABLED - TLS_OPTION_FLAGS + TLS_OPTION_FLAGS #endif - SO_END_OF_OPTIONS -}; + SO_END_OF_OPTIONS }; void printAtCol(const char* text, int col) { const char* iter = text; @@ -401,21 +414,25 @@ static void printProgramUsage(const char* name) { " FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n" " then `%s'.\n", platform::getDefaultClusterFilePath().c_str()); printf(" --log Enables trace file logging for the CLI session.\n" - " --log-dir PATH Specifes the output directory for trace files. If\n" - " unspecified, defaults to the current directory. Has\n" - " no effect unless --log is specified.\n" - " --trace_format FORMAT\n" - " Select the format of the log files. xml (the default) and json\n" - " are supported. Has no effect unless --log is specified.\n" - " --exec CMDS Immediately executes the semicolon separated CLI commands\n" - " and then exits.\n" - " --no-status Disables the initial status check done when starting\n" - " the CLI.\n" + " --log-dir PATH Specifes the output directory for trace files. If\n" + " unspecified, defaults to the current directory. Has\n" + " no effect unless --log is specified.\n" + " --trace_format FORMAT\n" + " Select the format of the log files. xml (the default) and json\n" + " are supported. Has no effect unless --log is specified.\n" + " -S ON|OFF, --object-serializer ON|OFF\n" + " Use object serializer for sending messages. 
The object serializer\n" + " is currently a beta feature and it allows fdb processes to talk to\n" + " each other even if they don't have the same version\n" + " --exec CMDS Immediately executes the semicolon separated CLI commands\n" + " and then exits.\n" + " --no-status Disables the initial status check done when starting\n" + " the CLI.\n" #ifndef TLS_DISABLED - TLS_HELP + TLS_HELP #endif - " -v, --version Print FoundationDB CLI version information and exit.\n" - " -h, --help Display this help and exit.\n"); + " -v, --version Print FoundationDB CLI version information and exit.\n" + " -h, --help Display this help and exit.\n"); } @@ -2332,6 +2349,7 @@ struct CLIOptions { bool trace; std::string traceDir; std::string traceFormat; + bool useObjectSerializer = false; int exit_timeout; Optional exec; bool initialStatusCheck; @@ -2403,41 +2421,55 @@ struct CLIOptions { #ifndef TLS_DISABLED // TLS Options - case TLSOptions::OPT_TLS_PLUGIN: - args.OptionArg(); - break; - case TLSOptions::OPT_TLS_CERTIFICATES: - tlsCertPath = args.OptionArg(); - break; - case TLSOptions::OPT_TLS_CA_FILE: - tlsCAPath = args.OptionArg(); - break; - case TLSOptions::OPT_TLS_KEY: - tlsKeyPath = args.OptionArg(); - break; - case TLSOptions::OPT_TLS_PASSWORD: - tlsPassword = args.OptionArg(); - break; - case TLSOptions::OPT_TLS_VERIFY_PEERS: - tlsVerifyPeers = args.OptionArg(); - break; + case TLSOptions::OPT_TLS_PLUGIN: + args.OptionArg(); + break; + case TLSOptions::OPT_TLS_CERTIFICATES: + tlsCertPath = args.OptionArg(); + break; + case TLSOptions::OPT_TLS_CA_FILE: + tlsCAPath = args.OptionArg(); + break; + case TLSOptions::OPT_TLS_KEY: + tlsKeyPath = args.OptionArg(); + break; + case TLSOptions::OPT_TLS_PASSWORD: + tlsPassword = args.OptionArg(); + break; + case TLSOptions::OPT_TLS_VERIFY_PEERS: + tlsVerifyPeers = args.OptionArg(); + break; #endif - case OPT_HELP: - printProgramUsage(program_name.c_str()); - return 0; - case OPT_STATUS_FROM_JSON: - return printStatusFromJSON(args.OptionArg()); - case OPT_TRACE_FORMAT: - if (!validateTraceFormat(args.OptionArg())) { - fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg()); - } - traceFormat = args.OptionArg(); - break; - case OPT_VERSION: - printVersion(); - return FDB_EXIT_SUCCESS; - } - return -1; + case OPT_HELP: + printProgramUsage(program_name.c_str()); + return 0; + case OPT_STATUS_FROM_JSON: + return printStatusFromJSON(args.OptionArg()); + case OPT_TRACE_FORMAT: + if (!validateTraceFormat(args.OptionArg())) { + fprintf(stderr, "WARNING: Unrecognized trace format `%s'\n", args.OptionArg()); + } + traceFormat = args.OptionArg(); + break; + case OPT_USE_OBJECT_SERIALIZER: { + std::string s = args.OptionArg(); + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + if (s == "on" || s == "true" || s == "1") { + useObjectSerializer = true; + } else if (s == "off" || s == "false" || s == "0") { + useObjectSerializer = false; + } else { + fprintf(stderr, "ERROR: Could not parse object serializer option: `%s'\n", s.c_str()); + printProgramUsage(program_name.c_str()); + flushAndExit(FDB_EXIT_ERROR); + } + break; + } + case OPT_VERSION: + printVersion(); + return FDB_EXIT_SUCCESS; + } + return -1; } }; @@ -3490,6 +3522,11 @@ int main(int argc, char **argv) { } setNetworkOption(FDBNetworkOptions::ENABLE_SLOW_TASK_PROFILING); } + // The USE_OBJECT_SERIALIZER network option expects an 8 byte little endian integer which is interpreted as zero = + // false, non-zero = true. 
+ setNetworkOption(FDBNetworkOptions::USE_OBJECT_SERIALIZER, + opt.useObjectSerializer ? LiteralStringRef("\x01\x00\x00\x00\x00\x00\x00\x00") + : LiteralStringRef("\x00\x00\x00\x00\x00\x00\x00\x00")); initHelp(); diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index 1de08c64f8..25bc58c71d 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -419,7 +419,7 @@ ACTOR Future readCommitted(Database cx, PromiseStreamtake(TaskDefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT)); + wait(lock->take(TaskPriority::DefaultYield, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT)); releaser = FlowLock::Releaser(*lock, limits.bytes + CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT); state Standalone values = wait(tr.getRange(begin, end, limits)); @@ -495,7 +495,7 @@ ACTOR Future readCommitted(Database cx, PromiseStream results, Fu //add lock wait(active); releaser.release(); - wait(lock->take(TaskDefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize())); + wait(lock->take(TaskPriority::DefaultYield, rangevalue.expectedSize() + rcGroup.items.expectedSize())); releaser = FlowLock::Releaser(*lock, rangevalue.expectedSize() + rcGroup.items.expectedSize()); for (auto & s : rangevalue){ @@ -613,7 +613,7 @@ ACTOR Future dumpData(Database cx, PromiseStream results, Referenc req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; totalBytes += mutationSize; - wait( commitLock->take(TaskDefaultYield, mutationSize) ); + wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) ); addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) ); if(endOfStream) { @@ -653,7 +653,7 @@ ACTOR Future coalesceKeyVersionCache(Key uid, Version endVersion, Referenc req.transaction.read_snapshot = committedVersion->get(); req.flags = req.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; - wait( commitLock->take(TaskDefaultYield, mutationSize) ); + wait( commitLock->take(TaskPriority::DefaultYield, mutationSize) ); addActor.send( commitLock->releaseWhen( success(commit.getReply(req)), mutationSize ) ); } @@ -671,7 +671,7 @@ ACTOR Future applyMutations(Database cx, Key uid, Key addPrefix, Key remov try { loop { if(beginVersion >= *endVersion) { - wait( commitLock.take(TaskDefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) ); + wait( commitLock.take(TaskPriority::DefaultYield, CLIENT_KNOBS->BACKUP_LOCK_BYTES) ); commitLock.release(CLIENT_KNOBS->BACKUP_LOCK_BYTES); if(beginVersion >= *endVersion) { return Void(); diff --git a/fdbclient/ClusterInterface.h b/fdbclient/ClusterInterface.h index bb51ce74f2..5e17807c4d 100644 --- a/fdbclient/ClusterInterface.h +++ b/fdbclient/ClusterInterface.h @@ -52,12 +52,12 @@ struct ClusterInterface { } void initEndpoints() { - openDatabase.getEndpoint( TaskClusterController ); - failureMonitoring.getEndpoint( TaskFailureMonitor ); - databaseStatus.getEndpoint( TaskClusterController ); - ping.getEndpoint( TaskClusterController ); - getClientWorkers.getEndpoint( TaskClusterController ); - forceRecovery.getEndpoint( TaskClusterController ); + openDatabase.getEndpoint( TaskPriority::ClusterController ); + failureMonitoring.getEndpoint( TaskPriority::FailureMonitor ); + databaseStatus.getEndpoint( TaskPriority::ClusterController ); + ping.getEndpoint( TaskPriority::ClusterController ); + getClientWorkers.getEndpoint( TaskPriority::ClusterController ); + 
forceRecovery.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index eca185e8f8..4c1c21dc6a 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -54,7 +54,7 @@ public: // For internal (fdbserver) use only static Database create( Reference>> clusterInterface, Reference connFile, LocalityData const& clientLocality ); - static Database create( Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID=TaskDefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST ); + static Database create( Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID=TaskPriority::DefaultEndpoint, bool lockAware=false, int apiVersion=Database::API_VERSION_LATEST ); ~DatabaseContext(); @@ -97,7 +97,7 @@ public: //private: explicit DatabaseContext( Reference cluster, Reference> clientDBInfo, - Future clientInfoMonitor, Standalone dbId, int taskID, LocalityData const& clientLocality, + Future clientInfoMonitor, Standalone dbId, TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion = Database::API_VERSION_LATEST ); explicit DatabaseContext( const Error &err ); @@ -162,7 +162,7 @@ public: Future logger; - int taskID; + TaskPriority taskID; Int64MetricHandle getValueSubmitted; EventMetricHandle getValueCompleted; diff --git a/fdbclient/FailureMonitorClient.actor.cpp b/fdbclient/FailureMonitorClient.actor.cpp index 3be7a4dccd..7cb1a3144e 100644 --- a/fdbclient/FailureMonitorClient.actor.cpp +++ b/fdbclient/FailureMonitorClient.actor.cpp @@ -41,7 +41,7 @@ ACTOR Future failureMonitorClientLoop( { state Version version = 0; state Future request = Never(); - state Future nextRequest = delay(0, TaskFailureMonitor); + state Future nextRequest = delay(0, TaskPriority::FailureMonitor); state Future requestTimeout = Never(); state double before = now(); state double waitfor = 0; @@ -61,7 +61,7 @@ ACTOR Future failureMonitorClientLoop( loop { choose { when( FailureMonitoringReply reply = wait( request ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); request = Never(); requestTimeout = Never(); if (reply.allOthersFailed) { @@ -122,10 +122,10 @@ ACTOR Future failureMonitorClientLoop( } before = now(); waitfor = reply.clientRequestIntervalMS * .001; - nextRequest = delayJittered( waitfor, TaskFailureMonitor ); + nextRequest = delayJittered( waitfor, TaskPriority::FailureMonitor ); } when( wait( requestTimeout ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); requestTimeout = Never(); TraceEvent(SevWarn, "FailureMonitoringServerDown").detail("OldServerID",controller.id()); monitor->setStatus(controlAddr.address, FailureStatus(true)); @@ -136,7 +136,7 @@ ACTOR Future failureMonitorClientLoop( } } when( wait( nextRequest ) ) { - g_network->setCurrentTask(TaskDefaultDelay); + g_network->setCurrentTask(TaskPriority::DefaultDelay); nextRequest = Never(); double elapsed = now() - before; @@ -152,9 +152,9 @@ ACTOR Future failureMonitorClientLoop( req.addresses = g_network->getLocalAddresses(); if (trackMyStatus) req.senderStatus = FailureStatus(false); - request = controller.failureMonitoring.getReply( req, TaskFailureMonitor ); + request = controller.failureMonitoring.getReply( req, 
TaskPriority::FailureMonitor ); if(!controller.failureMonitoring.getEndpoint().isLocal()) - requestTimeout = delay( fmState->serverFailedTimeout, TaskFailureMonitor ); + requestTimeout = delay( fmState->serverFailedTimeout, TaskPriority::FailureMonitor ); } } } diff --git a/fdbclient/HTTP.actor.cpp b/fdbclient/HTTP.actor.cpp index 00cece10a1..5893588406 100644 --- a/fdbclient/HTTP.actor.cpp +++ b/fdbclient/HTTP.actor.cpp @@ -93,7 +93,7 @@ namespace HTTP { loop { // Wait for connection to have something to read wait(conn->onReadable()); - wait( delay( 0, TaskReadSocket ) ); + wait( delay( 0, TaskPriority::ReadSocket ) ); // Read into buffer int originalSize = buf->size(); @@ -353,7 +353,7 @@ namespace HTTP { loop { wait(conn->onWritable()); - wait( delay( 0, TaskWriteSocket ) ); + wait( delay( 0, TaskPriority::WriteSocket ) ); // If we already got a response, before finishing sending the request, then close the connection, // set the Connection header to "close" as a hint to the caller that this connection can't be used diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index a371ac2624..afc64d62c2 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -967,7 +967,7 @@ ACTOR Future changeQuorum( Database cx, Reference>> leaderServers; ClientCoordinators coord( Reference( new ClusterConnectionFile( conn ) ) ); for( int i = 0; i < coord.clientLeaderServers.size(); i++ ) - leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) ); + leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) ); choose { when( wait( waitForAll( leaderServers ) ) ) {} @@ -1047,7 +1047,7 @@ struct AutoQuorumChange : IQuorumChange { ClientCoordinators coord(ccf); vector>> leaderServers; for( int i = 0; i < coord.clientLeaderServers.size(); i++ ) - leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskCoordinationReply ) ); + leaderServers.push_back( retryBrokenPromise( coord.clientLeaderServers[i].getLeader, GetLeaderRequest( coord.clusterKey, UID() ), TaskPriority::CoordinationReply ) ); Optional>> results = wait( timeout( getAll(leaderServers), CLIENT_KNOBS->IS_ACCEPTABLE_DELAY ) ); if (!results.present()) return false; // Not all responded for(auto& r : results.get()) diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index 9b65ec572c..dea0d8b797 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -67,10 +67,10 @@ struct MasterProxyInterface { } void initEndpoints() { - getConsistentReadVersion.getEndpoint(TaskProxyGetConsistentReadVersion); - getRawCommittedVersion.getEndpoint(TaskProxyGetRawCommittedVersion); - commit.getEndpoint(TaskProxyCommitDispatcher); - getStorageServerRejoinInfo.getEndpoint(TaskProxyStorageRejoin); + getConsistentReadVersion.getEndpoint(TaskPriority::ProxyGetConsistentReadVersion); + getRawCommittedVersion.getEndpoint(TaskPriority::ProxyGetRawCommittedVersion); + commit.getEndpoint(TaskPriority::ProxyCommitDispatcher); + getStorageServerRejoinInfo.getEndpoint(TaskPriority::ProxyStorageRejoin); //getKeyServersLocations.getEndpoint(TaskProxyGetKeyServersLocations); //do not increase the priority of these requests, because clients cans bring down the cluster with too many of these 
messages. } }; diff --git a/fdbclient/MonitorLeader.actor.cpp b/fdbclient/MonitorLeader.actor.cpp index 6210eb8810..b066b03b13 100644 --- a/fdbclient/MonitorLeader.actor.cpp +++ b/fdbclient/MonitorLeader.actor.cpp @@ -371,7 +371,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( NetworkAddress remote ) } ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) { - getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskCoordination ); + getLeader.makeWellKnownEndpoint( WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::Coordination ); } // Nominee is the worker among all workers that are considered as leader by a coordinator @@ -380,7 +380,7 @@ ClientLeaderRegInterface::ClientLeaderRegInterface( INetwork* local ) { ACTOR Future monitorNominee( Key key, ClientLeaderRegInterface coord, AsyncTrigger* nomineeChange, Optional *info, int generation, Reference> connectedCoordinatorsNum ) { state bool hasCounted = false; loop { - state Optional li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? info->get().changeID : UID() ), TaskCoordinationReply ) ); + state Optional li = wait( retryBrokenPromise( coord.getLeader, GetLeaderRequest( key, info->present() ? info->get().changeID : UID() ), TaskPriority::CoordinationReply ) ); if (li.present() && !hasCounted && connectedCoordinatorsNum.isValid()) { connectedCoordinatorsNum->set(connectedCoordinatorsNum->get() + 1); hasCounted = true; diff --git a/fdbclient/MultiVersionTransaction.actor.cpp b/fdbclient/MultiVersionTransaction.actor.cpp index c2ef5ad1b6..3fb0f675d1 100644 --- a/fdbclient/MultiVersionTransaction.actor.cpp +++ b/fdbclient/MultiVersionTransaction.actor.cpp @@ -588,7 +588,20 @@ ThreadFuture MultiVersionTransaction::onError(Error const& e) { else { auto tr = getTransaction(); auto f = tr.transaction ? 
tr.transaction->onError(e) : ThreadFuture(Never()); - return abortableFuture(f, tr.onChange); + f = abortableFuture(f, tr.onChange); + + return flatMapThreadFuture(f, [this, e](ErrorOr ready) { + if(!ready.isError() || ready.getError().code() != error_code_cluster_version_changed) { + if(ready.isError()) { + return ErrorOr>(ready.getError()); + } + + return ErrorOr>(Void()); + } + + updateTransaction(); + return ErrorOr>(onError(e)); + }); } } diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 6cea540491..8ada99503f 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -511,7 +511,7 @@ Future DatabaseContext::getHealthMetrics(bool detailed = false) { DatabaseContext::DatabaseContext( Reference cluster, Reference> clientInfo, Future clientInfoMonitor, Standalone dbId, - int taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) + TaskPriority taskID, LocalityData const& clientLocality, bool enableLocalityLoadBalance, bool lockAware, int apiVersion ) : cluster(cluster), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), dbId(dbId), taskID(taskID), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), lockAware(lockAware), apiVersion(apiVersion), provisional(false), transactionReadVersions(0), transactionLogicalReads(0), transactionPhysicalReads(0), transactionCommittedMutations(0), transactionCommittedMutationBytes(0), @@ -632,10 +632,10 @@ Database DatabaseContext::create(Reference>> Reference> clientInfo(new AsyncVar()); Future clientInfoMonitor = delayedAsyncVar(connectedCoordinatorsNum, connectedCoordinatorsNumDelayed, CLIENT_KNOBS->CHECK_CONNECTED_COORDINATOR_NUM_DELAY) || monitorClientInfo(clusterInterface, connFile, clientInfo, connectedCoordinatorsNumDelayed); - return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false)); + return Database(new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false)); } -Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, int taskID, bool lockAware, int apiVersion) { +Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, bool enableLocalityLoadBalance, TaskPriority taskID, bool lockAware, int apiVersion) { return Database( new DatabaseContext( Reference(nullptr), clientInfo, clientInfoMonitor, LiteralStringRef(""), taskID, clientLocality, enableLocalityLoadBalance, lockAware, apiVersion ) ); } @@ -825,10 +825,10 @@ Database Database::createDatabase( Reference connFile, in DatabaseContext *db; if(preallocatedDb) { - db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion); + db = new (preallocatedDb) DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); } else { - db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskDefaultEndpoint, clientLocality, true, false, apiVersion); + db = new DatabaseContext(cluster, clientInfo, clientInfoMonitor, LiteralStringRef(""), TaskPriority::DefaultEndpoint, clientLocality, true, false, apiVersion); } return 
Database(db); @@ -884,7 +884,7 @@ void Cluster::init( Reference connFile, bool startClientI initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(publicIP))); systemMonitor(); - uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskFlushTrace ) ); + uncancellable( recurring( &systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace ) ); } failMon = failureMonitorClient( clusterInterface, false ); @@ -1240,7 +1240,7 @@ ACTOR Future< pair> > getKeyLocation_internal( loop { choose { when ( wait( cx->onMasterProxiesChanged() ) ) {} - when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional(), 100, isBackward, key.arena()), TaskDefaultPromiseEndpoint ) ) ) { + when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After"); ASSERT( rep.results.size() == 1 ); @@ -1277,7 +1277,7 @@ ACTOR Future< vector< pair> > > getKeyRangeLoca loop { choose { when ( wait( cx->onMasterProxiesChanged() ) ) {} - when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskDefaultPromiseEndpoint ) ) ) { + when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { state GetKeyServerLocationsReply rep = _rep; if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.After"); @@ -1398,7 +1398,7 @@ ACTOR Future> getValue( Future version, Key key, Databa } state GetValueReply reply = wait( loadBalance(ssi.second, &StorageServerInterface::getValue, GetValueRequest(key, ver, getValueID), - TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL)); + TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL)); double latency = now() - startTimeD; cx->readLatencies.addSample(latency); if (trLogInfo) { @@ -1461,7 +1461,7 @@ ACTOR Future getKey( Database cx, KeySelector k, Future version, T if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); ++cx->transactionPhysicalReads; - GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyReply reply = wait( loadBalance( ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get()), TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? 
&cx->queueModel : NULL ) ); if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", reply.sel.offset).detail("OrEqual", k.orEqual); k = reply.sel; @@ -1524,7 +1524,7 @@ ACTOR Future< Void > watchValue( Future version, Key key, OptionalgetCurrentTask()); } - state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskDefaultPromiseEndpoint ) ); + state Version resp = wait( loadBalance( ssi.second, &StorageServerInterface::watchValue, WatchValueRequest(key, value, ver, watchValueID), TaskPriority::DefaultPromiseEndpoint ) ); if( info.debugID.present() ) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); //.detail("TaskID", g_network->getCurrentTask()); } @@ -1616,7 +1616,7 @@ ACTOR Future> getExactRange( Database cx, Version ver .detail("Servers", locations[shard].second->description());*/ } ++cx->transactionPhysicalReads; - GetKeyValuesReply rep = wait( loadBalance( locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskDefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyValuesReply rep = wait( loadBalance( locations[shard].second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.After"); output.arena().dependsOn( rep.arena ); @@ -1893,7 +1893,7 @@ ACTOR Future> getRange( Database cx, ReferenceenableLocalityLoadBalance ? &cx->queueModel : NULL ) ); + GetKeyValuesReply rep = wait( loadBalance(beginServer.second, &StorageServerInterface::getKeyValues, req, TaskPriority::DefaultPromiseEndpoint, false, cx->enableLocalityLoadBalance ? &cx->queueModel : NULL ) ); if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getRange.After");//.detail("SizeOf", rep.data.size()); @@ -2694,7 +2694,7 @@ ACTOR static Future tryCommit( Database cx, Reference const std::vector& proxies = cx->clientInfo->get().proxies; reply = proxies.size() ? throwErrorOr ( brokenPromiseToMaybeDelivered ( proxies[0].commit.tryGetReply(req) ) ) : Never(); } else { - reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskDefaultPromiseEndpoint, true ); + reply = loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, true ); } choose { @@ -3074,7 +3074,7 @@ ACTOR Future readVersionBatcher( DatabaseContext *cx, FutureStream< std::p if (requests.size() == CLIENT_KNOBS->MAX_BATCH_SIZE) send_batch = true; else if (!timeout.isValid()) - timeout = delay(batchTime, TaskProxyGetConsistentReadVersion); + timeout = delay(batchTime, TaskPriority::ProxyGetConsistentReadVersion); } when(wait(timeout.isValid() ? 
timeout : Never())) { send_batch = true; @@ -3235,7 +3235,7 @@ ACTOR Future< StorageMetrics > waitStorageMetricsMultipleLocations( WaitMetricsRequest req(locations[i].first, StorageMetrics(), StorageMetrics()); req.min.bytes = 0; req.max.bytes = -1; - fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution ); + fx[i] = loadBalance( locations[i].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution ); } wait( waitForAll(fx) ); @@ -3266,7 +3266,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( int shardLimit ) { loop { - vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskDataDistribution) ) ); + vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, shardLimit, false, &StorageServerInterface::waitMetrics, TransactionInfo(TaskPriority::DataDistribution) ) ); //SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this. if(locations.size() < shardLimit) { @@ -3276,7 +3276,7 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( fx = waitStorageMetricsMultipleLocations( locations, min, max, permittedError ); } else { WaitMetricsRequest req( keys, min, max ); - fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskDataDistribution ); + fx = loadBalance( locations[0].second, &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution ); } StorageMetrics x = wait(fx); return x; @@ -3286,14 +3286,14 @@ ACTOR Future< StorageMetrics > waitStorageMetrics( throw; } cx->invalidateCache(keys); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } else { TraceEvent(SevWarn, "WaitStorageMetricsPenalty") .detail("Keys", keys) .detail("Limit", CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY); - wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution)); + wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range cx->invalidateCache( keys ); } @@ -3319,13 +3319,13 @@ Future< StorageMetrics > Transaction::getStorageMetrics( KeyRange const& keys, i ACTOR Future< Standalone> > splitStorageMetrics( Database cx, KeyRange keys, StorageMetrics limit, StorageMetrics estimated ) { loop { - state vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskDataDistribution) ) ); + state vector< pair> > locations = wait( getKeyRangeLocations( cx, keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT, false, &StorageServerInterface::splitMetrics, TransactionInfo(TaskPriority::DataDistribution) ) ); state StorageMetrics used; state Standalone> results; //SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better solution to this. 
if(locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) { - wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); cx->invalidateCache(keys); } else { @@ -3336,7 +3336,7 @@ ACTOR Future< Standalone> > splitStorageMetrics( Database cx, state int i = 0; for(; i> > splitStorageMetrics( Database cx, throw; } cx->invalidateCache( keys ); - wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskDataDistribution)); + wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } } } diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h index 0fbf76cfe4..b7c3aa6d71 100644 --- a/fdbclient/NativeAPI.actor.h +++ b/fdbclient/NativeAPI.actor.h @@ -163,10 +163,10 @@ struct TransactionOptions { struct TransactionInfo { Optional debugID; - int taskID; + TaskPriority taskID; bool useProvisionalProxies; - explicit TransactionInfo( int taskID ) : taskID(taskID), useProvisionalProxies(false) {} + explicit TransactionInfo( TaskPriority taskID ) : taskID(taskID), useProvisionalProxies(false) {} }; struct TransactionLogInfo : public ReferenceCounted, NonCopyable { @@ -286,7 +286,7 @@ public: void flushTrLogsIfEnabled(); // These are to permit use as state variables in actors: - Transaction() : info( TaskDefaultEndpoint ) {} + Transaction() : info( TaskPriority::DefaultEndpoint ) {} void operator=(Transaction&& r) BOOST_NOEXCEPT; void reset(); diff --git a/fdbclient/StatusClient.actor.cpp b/fdbclient/StatusClient.actor.cpp index d4b06a5182..8e706987a9 100644 --- a/fdbclient/StatusClient.actor.cpp +++ b/fdbclient/StatusClient.actor.cpp @@ -291,7 +291,7 @@ ACTOR Future> clientCoordinatorsStatusFetcher(Reference>> leaderServers; for (int i = 0; i < coord.clientLeaderServers.size(); i++) - leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskCoordinationReply)); + leaderServers.push_back(retryBrokenPromise(coord.clientLeaderServers[i].getLeader, GetLeaderRequest(coord.clusterKey, UID()), TaskPriority::CoordinationReply)); wait( smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.5) || delay(2.0) ); diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 51fccd9b87..721654a462 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -80,9 +80,9 @@ struct StorageServerInterface { bool operator == (StorageServerInterface const& s) const { return uniqueID == s.uniqueID; } bool operator < (StorageServerInterface const& s) const { return uniqueID < s.uniqueID; } void initEndpoints() { - getValue.getEndpoint( TaskLoadBalancedEndpoint ); - getKey.getEndpoint( TaskLoadBalancedEndpoint ); - getKeyValues.getEndpoint( TaskLoadBalancedEndpoint ); + getValue.getEndpoint( TaskPriority::LoadBalancedEndpoint ); + getKey.getEndpoint( TaskPriority::LoadBalancedEndpoint ); + getKeyValues.getEndpoint( TaskPriority::LoadBalancedEndpoint ); } }; diff --git a/fdbclient/VersionedMap.actor.h b/fdbclient/VersionedMap.actor.h index cfb9e650f6..53ba85097f 100644 --- a/fdbclient/VersionedMap.actor.h +++ b/fdbclient/VersionedMap.actor.h @@ -31,7 +31,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. 
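The ``deferredCleanupActor`` change that follows replaces the magic default ``7000`` with ``TaskPriority::DefaultYield``, part of the patch-wide migration from plain ``int`` task IDs to a scoped enum. A minimal sketch of why the scoped enum helps; this is not the real ``flow/network.h`` definition, and the enumerator values are placeholders rather than the actual priorities:

    #include <cstdint>

    enum class TaskPriority : int64_t {
        WriteSocket     = 10000,  // placeholder value
        ReadSocket      = 9000,   // placeholder value
        DefaultYield    = 7000,   // the patch maps the old bare default "7000" to this name
        UnknownEndpoint = 4000    // placeholder value
    };

    // Relational operators still work between values of the same scoped enum,
    // so priority comparisons such as the one in deliver() keep compiling.
    static_assert(TaskPriority::ReadSocket > TaskPriority::DefaultYield,
                  "larger value means higher priority");

    // Callers must now name the priority; a bare integer no longer converts:
    //   delay(0, 7000);                          // old style, meaning unclear
    //   delay(0, TaskPriority::DefaultYield);    // new style, self-documenting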
ACTOR template -Future deferredCleanupActor( std::vector toFree, int taskID = 7000 ) { +Future deferredCleanupActor( std::vector toFree, TaskPriority taskID = TaskPriority::DefaultYield ) { state int freeCount = 0; while (!toFree.empty()) { Tree a = std::move( toFree.back() ); diff --git a/fdbclient/VersionedMap.h b/fdbclient/VersionedMap.h index 705108ce72..f56b883892 100644 --- a/fdbclient/VersionedMap.h +++ b/fdbclient/VersionedMap.h @@ -511,7 +511,7 @@ public: oldestVersion = newOldestVersion; } - Future forgetVersionsBeforeAsync( Version newOldestVersion, int taskID = 7000 ) { + Future forgetVersionsBeforeAsync( Version newOldestVersion, TaskPriority taskID = TaskPriority::DefaultYield ) { ASSERT( newOldestVersion <= latestVersion ); roots[newOldestVersion] = getRoot(newOldestVersion); diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index 12ca1866ad..f786266888 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -266,7 +266,7 @@ private: } ACTOR static Future read_impl( int fd, void* data, int length, int64_t offset ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; //fprintf(stderr, "eio_read (fd=%d length=%d offset=%lld)\n", fd, length, offset); state eio_req* r = eio_read(fd, data, length, offset, 0, eio_callback, &p); @@ -289,7 +289,7 @@ private: } ACTOR static Future write_impl( int fd, Reference err, StringRef data, int64_t offset ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_write(fd, (void*)data.begin(), data.size(), offset, 0, eio_callback, &p); try { wait( p.getFuture() ); } catch (...) { g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -299,7 +299,7 @@ private: } ACTOR static Future truncate_impl( int fd, Reference err, int64_t size ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_ftruncate(fd, size, 0, eio_callback, &p); try { wait( p.getFuture() ); } catch (...) { g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -330,7 +330,7 @@ private: } ACTOR static Future sync_impl( int fd, Reference err, bool sync_metadata=false ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = start_fsync( fd, p, sync_metadata ); @@ -350,7 +350,7 @@ private: } ACTOR static Future size_impl( int fd ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state eio_req* r = eio_fstat( fd, 0, eio_callback, &p ); try { wait( p.getFuture() ); } catch (...) 
{ g_network->setCurrentTask( taskID ); eio_cancel(r); throw; } @@ -363,7 +363,7 @@ private: } ACTOR static Future stat_impl( std::string filename ) { - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state Promise p; state EIO_STRUCT_STAT statdata; state eio_req* r = eio_stat( filename.c_str(), 0, eio_callback, &p ); @@ -377,7 +377,7 @@ private: ACTOR template static Future dispatch_impl( std::function func) { state Dispatch data( func ); - state int taskID = g_network->getCurrentTask(); + state TaskPriority taskID = g_network->getCurrentTask(); state eio_req* r = eio_custom( [](eio_req* req) { // Runs on the eio thread pool @@ -418,7 +418,7 @@ private: static void eio_want_poll() { want_poll = 1; // SOMEDAY: NULL for deferred error, no analysis of correctness (itp) - onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPollEIO); + onMainThreadVoid([](){ poll_eio(); }, NULL, TaskPriority::PollEIO); } static int eio_callback( eio_req* req ) { diff --git a/fdbrpc/AsyncFileKAIO.actor.h b/fdbrpc/AsyncFileKAIO.actor.h index ac66605be3..14495a6cdf 100644 --- a/fdbrpc/AsyncFileKAIO.actor.h +++ b/fdbrpc/AsyncFileKAIO.actor.h @@ -472,9 +472,9 @@ private: #endif } - int getTask() const { return (prio>>32)+1; } + TaskPriority getTask() const { return static_cast((prio>>32)+1); } - ACTOR static void deliver( Promise result, bool failed, int r, int task ) { + ACTOR static void deliver( Promise result, bool failed, int r, TaskPriority task ) { wait( delay(0, task) ); if (failed) result.sendError(io_timeout()); else if (r < 0) result.sendError(io_error()); @@ -649,7 +649,7 @@ private: loop { wait(success(ev->read())); - wait(delay(0, TaskDiskIOComplete)); + wait(delay(0, TaskPriority::DiskIOComplete)); linux_ioresult ev[FLOW_KNOBS->MAX_OUTSTANDING]; timespec tm; tm.tv_sec = 0; tm.tv_nsec = 0; diff --git a/fdbrpc/AsyncFileNonDurable.actor.cpp b/fdbrpc/AsyncFileNonDurable.actor.cpp index a3257f1fa8..6ea0129a27 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.cpp +++ b/fdbrpc/AsyncFileNonDurable.actor.cpp @@ -23,13 +23,13 @@ std::map> AsyncFileNonDurable::filesBeingDeleted; -ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, int taskID ) { +ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID ) { wait( g_simulator.onProcess( process, taskID ) ); promise.send(Void()); return Void(); } -ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, int taskID ) { +ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, TaskPriority taskID ) { wait( g_simulator.onProcess( process, taskID ) ); promise.sendError(e); return Void(); diff --git a/fdbrpc/AsyncFileNonDurable.actor.h b/fdbrpc/AsyncFileNonDurable.actor.h index 03fe8e852c..7e8e551b3e 100644 --- a/fdbrpc/AsyncFileNonDurable.actor.h +++ b/fdbrpc/AsyncFileNonDurable.actor.h @@ -38,8 +38,8 @@ #undef max #undef min -Future sendOnProcess( ISimulator::ProcessInfo* const& process, Promise const& promise, int const& taskID ); -Future sendErrorOnProcess( ISimulator::ProcessInfo* const& process, Promise const& promise, Error const& e, int const& taskID ); +ACTOR Future sendOnProcess( ISimulator::ProcessInfo* process, Promise promise, TaskPriority taskID ); +ACTOR Future sendErrorOnProcess( ISimulator::ProcessInfo* process, Promise promise, Error e, TaskPriority taskID ); ACTOR template Future sendErrorOnShutdown( Future in ) { @@ -198,7 +198,7 @@ public: //Creates a new 
AsyncFileNonDurable which wraps the provided IAsyncFile ACTOR static Future> open(std::string filename, std::string actualFilename, Future> wrappedFile, Reference diskParameters) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); state Future shutdown = success(currentProcess->shutdownSignal.getFuture()); //TraceEvent("AsyncFileNonDurableOpenBegin").detail("Filename", filename).detail("Addr", g_simulator.getCurrentProcess()->address); @@ -391,7 +391,7 @@ private: ACTOR Future read(AsyncFileNonDurable *self, void *data, int length, int64_t offset) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { @@ -411,7 +411,7 @@ private: //or none of the write. It may also corrupt parts of sectors which have not been written correctly ACTOR Future write(AsyncFileNonDurable *self, Promise writeStarted, Future> ownFuture, void const* data, int length, int64_t offset) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; @@ -535,7 +535,7 @@ private: //If a kill interrupts the delay, then the truncate may or may not be performed ACTOR Future truncate(AsyncFileNonDurable *self, Promise truncateStarted, Future> ownFuture, int64_t size) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); state double delayDuration = deterministicRandom()->random01() * self->maxWriteDelay; @@ -573,8 +573,8 @@ private: } } - if(g_network->check_yield(TaskDefaultYield)) { - wait(delay(0, TaskDefaultYield)); + if(g_network->check_yield(TaskPriority::DefaultYield)) { + wait(delay(0, TaskPriority::DefaultYield)); } //If performing a durable truncate, then pass it through to the file. 
Otherwise, pass it through with a 1/2 chance @@ -663,7 +663,7 @@ private: ACTOR Future sync(AsyncFileNonDurable *self, bool durable) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { @@ -695,7 +695,7 @@ private: ACTOR Future size(AsyncFileNonDurable *self) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); @@ -714,7 +714,7 @@ private: //Finishes all outstanding actors on an AsyncFileNonDurable and then deletes it ACTOR Future deleteFile(AsyncFileNonDurable *self) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); state std::string filename = self->filename; wait( g_simulator.onMachine( currentProcess ) ); diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index a44b588800..4934c3ea4a 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -172,28 +172,28 @@ struct YieldMockNetwork : INetwork, ReferenceCounted { t.send(Void()); } - virtual Future delay(double seconds, int taskID) { + virtual Future delay(double seconds, TaskPriority taskID) { return nextTick.getFuture(); } - virtual Future yield(int taskID) { + virtual Future yield(TaskPriority taskID) { if (check_yield(taskID)) return delay(0,taskID); return Void(); } - virtual bool check_yield(int taskID) { + virtual bool check_yield(TaskPriority taskID) { if (nextYield > 0) --nextYield; return nextYield == 0; } // Delegate everything else. 
TODO: Make a base class NetworkWrapper for delegating everything in INetwork - virtual int getCurrentTask() { return baseNetwork->getCurrentTask(); } - virtual void setCurrentTask(int taskID) { baseNetwork->setCurrentTask(taskID); } + virtual TaskPriority getCurrentTask() { return baseNetwork->getCurrentTask(); } + virtual void setCurrentTask(TaskPriority taskID) { baseNetwork->setCurrentTask(taskID); } virtual double now() { return baseNetwork->now(); } virtual void stop() { return baseNetwork->stop(); } virtual bool isSimulated() const { return baseNetwork->isSimulated(); } - virtual void onMainThread(Promise&& signal, int taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); } + virtual void onMainThread(Promise&& signal, TaskPriority taskID) { return baseNetwork->onMainThread(std::move(signal), taskID); } bool isOnMainThread() const override { return baseNetwork->isOnMainThread(); } virtual THREAD_HANDLE startThread(THREAD_FUNC_RETURN(*func) (void *), void *arg) { return baseNetwork->startThread(func,arg); } virtual Future< Reference > open(std::string filename, int64_t flags, int64_t mode) { return IAsyncFileSystem::filesystem()->open(filename,flags,mode); } diff --git a/fdbrpc/FlowTransport.actor.cpp b/fdbrpc/FlowTransport.actor.cpp index c559e269e6..c20aa607a6 100644 --- a/fdbrpc/FlowTransport.actor.cpp +++ b/fdbrpc/FlowTransport.actor.cpp @@ -50,9 +50,9 @@ const uint64_t TOKEN_STREAM_FLAG = 1; class EndpointMap : NonCopyable { public: EndpointMap(); - void insert( NetworkMessageReceiver* r, Endpoint::Token& token, uint32_t priority ); + void insert( NetworkMessageReceiver* r, Endpoint::Token& token, TaskPriority priority ); NetworkMessageReceiver* get( Endpoint::Token const& token ); - uint32_t getPriority( Endpoint::Token const& token ); + TaskPriority getPriority( Endpoint::Token const& token ); void remove( Endpoint::Token const& token, NetworkMessageReceiver* r ); private: @@ -86,12 +86,12 @@ void EndpointMap::realloc() { firstFree = oldSize; } -void EndpointMap::insert( NetworkMessageReceiver* r, Endpoint::Token& token, uint32_t priority ) { +void EndpointMap::insert( NetworkMessageReceiver* r, Endpoint::Token& token, TaskPriority priority ) { if (firstFree == uint32_t(-1)) realloc(); int index = firstFree; firstFree = data[index].nextFree; token = Endpoint::Token( token.first(), (token.second()&0xffffffff00000000LL) | index ); - data[index].token() = Endpoint::Token( token.first(), (token.second()&0xffffffff00000000LL) | priority ); + data[index].token() = Endpoint::Token( token.first(), (token.second()&0xffffffff00000000LL) | static_cast(priority) ); data[index].receiver = r; } @@ -102,11 +102,11 @@ NetworkMessageReceiver* EndpointMap::get( Endpoint::Token const& token ) { return 0; } -uint32_t EndpointMap::getPriority( Endpoint::Token const& token ) { +TaskPriority EndpointMap::getPriority( Endpoint::Token const& token ) { uint32_t index = token.second(); if ( index < data.size() && data[index].token().first() == token.first() && ((data[index].token().second()&0xffffffff00000000LL)|index)==token.second() ) - return data[index].token().second(); - return TaskUnknownEndpoint; + return static_cast(data[index].token().second()); + return TaskPriority::UnknownEndpoint; } void EndpointMap::remove( Endpoint::Token const& token, NetworkMessageReceiver* r ) { @@ -122,7 +122,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver { EndpointNotFoundReceiver(EndpointMap& endpoints) { //endpoints[WLTOKEN_ENDPOINT_NOT_FOUND] = this; Endpoint::Token e = 
WLTOKEN_ENDPOINT_NOT_FOUND; - endpoints.insert(this, e, TaskDefaultEndpoint); + endpoints.insert(this, e, TaskPriority::DefaultEndpoint); ASSERT( e == WLTOKEN_ENDPOINT_NOT_FOUND ); } virtual void receive( ArenaReader& reader ) { @@ -141,7 +141,7 @@ struct EndpointNotFoundReceiver : NetworkMessageReceiver { struct PingReceiver : NetworkMessageReceiver { PingReceiver(EndpointMap& endpoints) { Endpoint::Token e = WLTOKEN_PING_PACKET; - endpoints.insert(this, e, TaskReadSocket); + endpoints.insert(this, e, TaskPriority::ReadSocket); ASSERT( e == WLTOKEN_PING_PACKET ); } virtual void receive( ArenaReader& reader ) { @@ -438,10 +438,10 @@ struct Peer : NonCopyable { ACTOR static Future connectionWriter( Peer* self, Reference conn ) { state double lastWriteTime = now(); loop { - //wait( delay(0, TaskWriteSocket) ); - wait( delayJittered(std::max(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), TaskWriteSocket) ); - //wait( delay(500e-6, TaskWriteSocket) ); - //wait( yield(TaskWriteSocket) ); + //wait( delay(0, TaskPriority::WriteSocket) ); + wait( delayJittered(std::max(FLOW_KNOBS->MIN_COALESCE_DELAY, FLOW_KNOBS->MAX_COALESCE_DELAY - (now() - lastWriteTime)), TaskPriority::WriteSocket) ); + //wait( delay(500e-6, TaskPriority::WriteSocket) ); + //wait( yield(TaskPriority::WriteSocket) ); // Send until there is nothing left to send loop { @@ -456,7 +456,7 @@ struct Peer : NonCopyable { TEST(true); // We didn't write everything, so apparently the write buffer is full. Wait for it to be nonfull. wait( conn->onWritable() ); - wait( yield(TaskWriteSocket) ); + wait( yield(TaskPriority::WriteSocket) ); } // Wait until there is something to send @@ -602,8 +602,8 @@ TransportData::~TransportData() { } ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader reader, bool inReadSocket) { - int priority = self->endpoints.getPriority(destination.token); - if (priority < TaskReadSocket || !inReadSocket) { + TaskPriority priority = self->endpoints.getPriority(destination.token); + if (priority < TaskPriority::ReadSocket || !inReadSocket) { wait( delay(0, priority) ); } else { g_network->setCurrentTask( priority ); @@ -637,7 +637,7 @@ ACTOR static void deliver(TransportData* self, Endpoint destination, ArenaReader } if( inReadSocket ) - g_network->setCurrentTask( TaskReadSocket ); + g_network->setCurrentTask( TaskPriority::ReadSocket ); } static void scanPackets(TransportData* transport, uint8_t*& unprocessed_begin, const uint8_t* e, Arena& arena, @@ -797,7 +797,7 @@ ACTOR static Future connectionReader( if (len == 0) break; state int readBytes = conn->read(unprocessed_end, unprocessed_end + len); if (readBytes == 0) break; - wait(yield(TaskReadSocket)); + wait(yield(TaskPriority::ReadSocket)); totalReadBytes += readBytes; unprocessed_end += readBytes; } @@ -908,11 +908,11 @@ ACTOR static Future connectionReader( if (readWillBlock) break; - wait(yield(TaskReadSocket)); + wait(yield(TaskPriority::ReadSocket)); } wait( conn->onReadable() ); - wait(delay(0, TaskReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time + wait(delay(0, TaskPriority::ReadSocket)); // We don't want to call conn->read directly from the reactor - we could get stuck in the reactor reading 1 packet at a time } } catch (Error& e) { @@ -956,7 +956,7 @@ ACTOR static Future listen( TransportData* self, NetworkAddress listenAddr .detail("FromAddress", conn->getPeerAddress()) .detail("ListenAddress", 
listenAddr.toString()); incoming.add( connectionIncoming(self, conn) ); - wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskWriteSocket)); + wait(delay(0) || delay(FLOW_KNOBS->CONNECTION_ACCEPT_DELAY, TaskPriority::WriteSocket)); } } catch (Error& e) { TraceEvent(SevError, "ListenError").error(e); @@ -1078,7 +1078,7 @@ void FlowTransport::removePeerReference( const Endpoint& endpoint, NetworkMessag } } -void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) { +void FlowTransport::addEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) { endpoint.token = deterministicRandom()->randomUniqueID(); if (receiver->isStream()) { endpoint.addresses = self->localAddresses; @@ -1094,7 +1094,7 @@ void FlowTransport::removeEndpoint( const Endpoint& endpoint, NetworkMessageRece self->endpoints.remove(endpoint.token, receiver); } -void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, uint32_t taskID ) { +void FlowTransport::addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver* receiver, TaskPriority taskID ) { endpoint.addresses = self->localAddresses; ASSERT( ((endpoint.token.first() & TOKEN_STREAM_FLAG)!=0) == receiver->isStream() ); Endpoint::Token otoken = endpoint.token; diff --git a/fdbrpc/FlowTransport.h b/fdbrpc/FlowTransport.h index e4e27e598a..5bda279de3 100644 --- a/fdbrpc/FlowTransport.h +++ b/fdbrpc/FlowTransport.h @@ -138,13 +138,13 @@ public: void removePeerReference( const Endpoint&, NetworkMessageReceiver* ); // Signal that a peer connection is no longer being used - void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID ); + void addEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); // Sets endpoint to be a new local endpoint which delivers messages to the given receiver void removeEndpoint( const Endpoint&, NetworkMessageReceiver* ); // The given local endpoint no longer delivers messages to the given receiver or uses resources - void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, uint32_t taskID ); + void addWellKnownEndpoint( Endpoint& endpoint, NetworkMessageReceiver*, TaskPriority taskID ); // Sets endpoint to a new local endpoint (without changing its token) which delivers messages to the given receiver // Implementations may have limitations on when this function is called and what endpoint.token may be! diff --git a/fdbrpc/LoadBalance.actor.h b/fdbrpc/LoadBalance.actor.h index 191b89d93f..7b8c2b2a43 100644 --- a/fdbrpc/LoadBalance.actor.h +++ b/fdbrpc/LoadBalance.actor.h @@ -179,7 +179,7 @@ Future< REPLY_TYPE(Request) > loadBalance( Reference> alternatives, RequestStream Interface::* channel, Request request = Request(), - int taskID = TaskDefaultPromiseEndpoint, + TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, bool atMostOnce = false, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = NULL) { diff --git a/fdbrpc/ReplicationPolicy.h b/fdbrpc/ReplicationPolicy.h index 0f4f350cae..27752271d1 100644 --- a/fdbrpc/ReplicationPolicy.h +++ b/fdbrpc/ReplicationPolicy.h @@ -70,6 +70,13 @@ struct IReplicationPolicy : public ReferenceCounted { return keys; } virtual void attributeKeys(std::set*) const = 0; + + // For flatbuffers, IReplicationPolicy is just encoded as a string using + // |serializeReplicationPolicy|. 
|writer| is a member of IReplicationPolicy + // so that this string outlives all calls to + // dynamic_size_traits>::save + mutable BinaryWriter writer{ IncludeVersion() }; + mutable bool alreadyWritten = false; }; template @@ -276,12 +283,28 @@ void serializeReplicationPolicy(Ar& ar, Reference& policy) { template <> struct dynamic_size_traits> : std::true_type { - static WriteRawMemory save(const Reference& value) { - BinaryWriter writer(IncludeVersion()); - serializeReplicationPolicy(writer, const_cast&>(value)); - std::unique_ptr memory(new uint8_t[writer.getLength()]); - memcpy(memory.get(), writer.getData(), writer.getLength()); - return std::make_pair, size_t>(ownedPtr(const_cast(memory.release())), writer.getLength()); + static Block save(const Reference& value) { + if (value.getPtr() == nullptr) { + static BinaryWriter writer{ IncludeVersion() }; + writer = BinaryWriter{ IncludeVersion() }; + serializeReplicationPolicy(writer, const_cast&>(value)); + return unownedPtr(const_cast(reinterpret_cast(writer.getData())), + writer.getLength()); + } + if (!value->alreadyWritten) { + serializeReplicationPolicy(value->writer, const_cast&>(value)); + value->alreadyWritten = true; + } + return unownedPtr(const_cast(reinterpret_cast(value->writer.getData())), + value->writer.getLength()); + } + + static void serialization_done(const Reference& value) { + if (value.getPtr() == nullptr) { + return; + } + value->alreadyWritten = false; + value->writer = BinaryWriter{ IncludeVersion() }; } // Context is an arbitrary type that is plumbed by reference throughout the @@ -294,5 +317,6 @@ struct dynamic_size_traits> : std::true_type { } }; +static_assert(detail::has_serialization_done>>::value); #endif diff --git a/fdbrpc/batcher.actor.h b/fdbrpc/batcher.actor.h index 7e276ad574..72a9bc9094 100644 --- a/fdbrpc/batcher.actor.h +++ b/fdbrpc/batcher.actor.h @@ -47,7 +47,7 @@ bool firstInBatch(CommitTransactionRequest x) { } ACTOR template -Future batcher(PromiseStream, int> > out, FutureStream in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, int taskID = TaskDefaultDelay, Counter* counter = 0) +Future batcher(PromiseStream, int> > out, FutureStream in, double avgMinDelay, double* avgMaxDelay, double emptyBatchTimeout, int maxCount, int desiredBytes, int maxBytes, Optional> batchStartedStream, int64_t *commitBatchesMemBytesCount, int64_t commitBatchesMemBytesLimit, TaskPriority taskID = TaskPriority::DefaultDelay, Counter* counter = 0) { wait( delayJittered(*avgMaxDelay, taskID) ); // smooth out // This is set up to deliver even zero-size batches if emptyBatchTimeout elapses, because that's what master proxy wants. The source control history diff --git a/fdbrpc/fdbrpc.h b/fdbrpc/fdbrpc.h index 9853cbe968..75e0a9a551 100644 --- a/fdbrpc/fdbrpc.h +++ b/fdbrpc/fdbrpc.h @@ -48,7 +48,7 @@ struct FlowReceiver : private NetworkMessageReceiver { // If already a remote endpoint, returns that. Otherwise makes this // a local endpoint and returns that. 
- const Endpoint& getEndpoint(int taskID) { + const Endpoint& getEndpoint(TaskPriority taskID) { if (!endpoint.isValid()) { m_isLocalEndpoint = true; FlowTransport::transport().addEndpoint(endpoint, this, taskID); @@ -56,7 +56,7 @@ struct FlowReceiver : private NetworkMessageReceiver { return endpoint; } - void makeWellKnownEndpoint(Endpoint::Token token, int taskID) { + void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) { ASSERT(!endpoint.isValid()); m_isLocalEndpoint = true; endpoint.token = token; @@ -128,7 +128,7 @@ public: ~ReplyPromise() { if (sav) sav->delPromiseRef(); } ReplyPromise(const Endpoint& endpoint) : sav(new NetSAV(0, 1, endpoint)) {} - const Endpoint& getEndpoint(int taskID = TaskDefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); } + const Endpoint& getEndpoint(TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint) const { return sav->getEndpoint(taskID); } void operator=(const ReplyPromise& rhs) { if (rhs.sav) rhs.sav->addPromiseRef(); @@ -204,19 +204,19 @@ template void resetReply(ReplyPromise & p) { p.reset(); } template -void resetReply(Request& r, int taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); } +void resetReply(Request& r, TaskPriority taskID) { r.reply.reset(); r.reply.getEndpoint(taskID); } template -void resetReply(ReplyPromise & p, int taskID) { p.reset(); p.getEndpoint(taskID); } +void resetReply(ReplyPromise & p, TaskPriority taskID) { p.reset(); p.getEndpoint(taskID); } template -void setReplyPriority(Request& r, int taskID) { r.reply.getEndpoint(taskID); } +void setReplyPriority(Request& r, TaskPriority taskID) { r.reply.getEndpoint(taskID); } template -void setReplyPriority(ReplyPromise & p, int taskID) { p.getEndpoint(taskID); } +void setReplyPriority(ReplyPromise & p, TaskPriority taskID) { p.getEndpoint(taskID); } template -void setReplyPriority(const ReplyPromise & p, int taskID) { p.getEndpoint(taskID); } +void setReplyPriority(const ReplyPromise & p, TaskPriority taskID) { p.getEndpoint(taskID); } @@ -281,7 +281,7 @@ public: return reportEndpointFailure(getReplyPromise(value).getFuture(), getEndpoint()); } template - Future getReply(const X& value, int taskID) const { + Future getReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); return getReply(value); } @@ -290,7 +290,7 @@ public: return getReply(ReplyPromise()); } template - Future getReplyWithTaskID(int taskID) const { + Future getReplyWithTaskID(TaskPriority taskID) const { ReplyPromise reply; reply.getEndpoint(taskID); return getReply(reply); @@ -302,7 +302,7 @@ public: // If cancelled or returns failure, request was or will be delivered zero or one times. // The caller must be capable of retrying if this request returns failure template - Future> tryGetReply(const X& value, int taskID) const { + Future> tryGetReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); if (queue->isRemoteEndpoint()) { Future disc = makeDependent(IFailureMonitor::failureMonitor()).onDisconnectOrFailure(getEndpoint(taskID)); @@ -344,7 +344,7 @@ public: // If it returns failure, the failure detector considers the endpoint failed permanently or for the given amount of time // See IFailureMonitor::onFailedFor() for an explanation of the duration and slope parameters. 
template - Future> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, int taskID) const { + Future> getReplyUnlessFailedFor(const X& value, double sustainedFailureDuration, double sustainedFailureSlope, TaskPriority taskID) const { // If it is local endpoint, no need for failure monitoring return waitValueOrSignal(getReply(value, taskID), makeDependent(IFailureMonitor::failureMonitor()).onFailedFor(getEndpoint(taskID), sustainedFailureDuration, sustainedFailureSlope), @@ -388,8 +388,8 @@ public: //queue = (NetNotifiedQueue*)0xdeadbeef; } - Endpoint getEndpoint(int taskID = TaskDefaultEndpoint) const { return queue->getEndpoint(taskID); } - void makeWellKnownEndpoint(Endpoint::Token token, int taskID) { + Endpoint getEndpoint(TaskPriority taskID = TaskPriority::DefaultEndpoint) const { return queue->getEndpoint(taskID); } + void makeWellKnownEndpoint(Endpoint::Token token, TaskPriority taskID) { queue->makeWellKnownEndpoint(token, taskID); } @@ -425,7 +425,10 @@ struct serializable_traits> : std::true_type { } else { const auto& ep = stream.getEndpoint(); serializer(ar, ep); - UNSTOPPABLE_ASSERT(ep.getPrimaryAddress().isValid()); // No serializing PromiseStreams on a client with no public address + if constexpr (Archiver::isSerializing) { // Don't assert this when collecting vtable for flatbuffers + UNSTOPPABLE_ASSERT(ep.getPrimaryAddress() + .isValid()); // No serializing PromiseStreams on a client with no public address + } } } }; diff --git a/fdbrpc/genericactors.actor.h b/fdbrpc/genericactors.actor.h index 810ccdb731..744abaeebe 100644 --- a/fdbrpc/genericactors.actor.h +++ b/fdbrpc/genericactors.actor.h @@ -50,7 +50,7 @@ Future retryBrokenPromise( RequestStream to, Req request ) } ACTOR template -Future retryBrokenPromise( RequestStream to, Req request, int taskID ) { +Future retryBrokenPromise( RequestStream to, Req request, TaskPriority taskID ) { // Like to.getReply(request), except that a broken_promise exception results in retrying request immediately. // Suitable for use with well known endpoints, which are likely to return to existence after the other process restarts. // Not normally useful for ordinary endpoints, which conventionally are permanently destroyed after replying with broken_promise. 
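The hunks above replace every int taskID parameter and the old TaskDefault* constants with a scoped TaskPriority enum. A minimal standalone sketch (not part of the patch) of why that conversion is largely mechanical: relational comparisons and typed default arguments keep working on an enum class, while bare-int callers stop compiling. The enumerator values and the helper names below are assumptions for illustration only, not the real definitions from flow.

#include <cassert>
#include <cstdint>
#include <iostream>

// Illustrative stand-in for the real TaskPriority enum; the values are made up.
enum class TaskPriority : int64_t {
	Zero = 0,
	Min = 1,
	Low = 2000,
	DefaultYield = 7000,
	DefaultDelay = 7010,
	DefaultEndpoint = 7020,
	DefaultPromiseEndpoint = 7030,
	ReadSocket = 9000,
	WriteSocket = 9010,
	Max = 1000000
};

// Relational operators are defined for a scoped enum, so checks in the style of
// deliver()'s "priority < TaskPriority::ReadSocket" compile unchanged.
bool deliversInline(TaskPriority priority, bool inReadSocket) {
	return inReadSocket && priority >= TaskPriority::ReadSocket;
}

// A signature in the style of addEndpoint(): the default argument is now a typed
// enumerator instead of a bare integer constant.
void addEndpoint(TaskPriority taskID = TaskPriority::DefaultEndpoint) {
	assert(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max);
	std::cout << "registered at priority " << static_cast<int64_t>(taskID) << "\n";
}

int main() {
	addEndpoint();                          // uses TaskPriority::DefaultEndpoint
	addEndpoint(TaskPriority::ReadSocket);
	// addEndpoint(9000);                   // no longer compiles: no implicit int conversion
	std::cout << std::boolalpha << deliversInline(TaskPriority::WriteSocket, true) << "\n";
	return 0;
}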
diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index d83babec78..a7ee2623e9 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -423,7 +423,7 @@ public: ACTOR static Future> open( std::string filename, int flags, int mode, Reference diskParameters = Reference(new DiskParameters(25000, 150000000)), bool delayOnWrite = true ) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); if(++openCount >= 3000) { TraceEvent(SevError, "TooManyFiles"); @@ -742,11 +742,11 @@ public: // Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating machines and time virtual double now() { return time; } - virtual Future delay( double seconds, int taskID ) { - ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority); + virtual Future delay( double seconds, TaskPriority taskID ) { + ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max); return delay( seconds, taskID, currentProcess ); } - Future delay( double seconds, int taskID, ProcessInfo* machine ) { + Future delay( double seconds, TaskPriority taskID, ProcessInfo* machine ) { ASSERT( seconds >= -0.0001 ); seconds = std::max(0.0, seconds); Future f; @@ -761,13 +761,13 @@ public: return f; } - ACTOR static Future checkShutdown(Sim2 *self, int taskID) { + ACTOR static Future checkShutdown(Sim2 *self, TaskPriority taskID) { wait(success(self->getCurrentProcess()->shutdownSignal.getFuture())); self->setCurrentTask(taskID); return Void(); } - virtual Future yield( int taskID ) { - if (taskID == TaskDefaultYield) taskID = currentTaskID; + virtual Future yield( TaskPriority taskID ) { + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (check_yield(taskID)) { // We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation), but // don't want to prevent instantaneous shutdown of "rebooted" machines. 
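In the simulator hunks above and in the check_yield hunk that follows, TaskPriority::DefaultYield acts as a sentinel that resolves to the caller's current priority, and check_yield() bounds how many consecutive times it may answer false by resetting a randomly chosen limit between 1 and 150. A simplified standalone model of those two behaviors is below; the forced-yield-on-expiry detail is an assumption for illustration, and the real check_yield also mixes in BUGGIFY randomness.

#include <random>

// Simplified stand-in for the Sim2 pieces above; not the real implementation.
enum class TaskPriority { Zero = 0, DefaultYield = 7000, ReadSocket = 9000 };

struct MiniScheduler {
	TaskPriority currentTaskID = TaskPriority::Zero;
	int yield_limit = 0;
	std::mt19937 rng{ 0 };

	// DefaultYield is a sentinel meaning "resume at whatever priority the caller
	// is already running at", mirroring yield() in the hunk above.
	TaskPriority resolveYieldPriority(TaskPriority taskID) const {
		return taskID == TaskPriority::DefaultYield ? currentTaskID : taskID;
	}

	// Bound the number of consecutive "no need to yield" answers; once the
	// randomly chosen budget is spent, force a yield (assumed behavior for this
	// sketch) so a non-yielding loop cannot grow the stack indefinitely.
	bool check_yield() {
		if (--yield_limit <= 0) {
			yield_limit = std::uniform_int_distribution<int>(1, 150)(rng);
			return true;
		}
		return false;
	}
};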
@@ -776,7 +776,7 @@ public: setCurrentTask(taskID); return Void(); } - virtual bool check_yield( int taskID ) { + virtual bool check_yield( TaskPriority taskID ) { if (yielded) return true; if (--yield_limit <= 0) { yield_limit = deterministicRandom()->randomInt(1, 150); // If yield returns false *too* many times in a row, there could be a stack overflow, since we can't deterministically check stack size as the real network does @@ -784,10 +784,10 @@ public: } return yielded = BUGGIFY_WITH_PROB(0.01); } - virtual int getCurrentTask() { + virtual TaskPriority getCurrentTask() { return currentTaskID; } - virtual void setCurrentTask(int taskID ) { + virtual void setCurrentTask(TaskPriority taskID ) { currentTaskID = taskID; } // Sets the taskID/priority of the current task, without yielding @@ -924,7 +924,7 @@ public: } if ( mustBeDurable || deterministicRandom()->random01() < 0.5 ) { state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess(); - state int currentTaskID = g_network->getCurrentTask(); + state TaskPriority currentTaskID = g_network->getCurrentTask(); wait( g_simulator.onMachine( currentProcess ) ); try { wait( ::delay(0.05 * deterministicRandom()->random01()) ); @@ -950,7 +950,7 @@ public: ACTOR static Future runLoop(Sim2 *self) { state ISimulator::ProcessInfo *callingMachine = self->currentProcess; while ( !self->isStopped ) { - wait( self->net2->yield(TaskDefaultYield) ); + wait( self->net2->yield(TaskPriority::DefaultYield) ); self->mutex.enter(); if( self->tasks.size() == 0 ) { @@ -1580,23 +1580,23 @@ public: machines.erase(machineId); } - Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(-1) { + Sim2(bool objSerializer) : time(0.0), taskCount(0), yielded(false), yield_limit(0), currentTaskID(TaskPriority::Zero) { // Not letting currentProcess be NULL eliminates some annoying special cases currentProcess = new ProcessInfo("NoMachine", LocalityData(Optional>(), StringRef(), StringRef(), StringRef()), ProcessClass(), {NetworkAddress()}, this, "", ""); g_network = net2 = newNet2(false, true, objSerializer); Net2FileSystem::newFileSystem(); - check_yield(0); + check_yield(TaskPriority::Zero); } // Implementation struct Task { - int taskID; + TaskPriority taskID; double time; uint64_t stable; ProcessInfo* machine; Promise action; - Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Promise&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {} - Task( double time, int taskID, uint64_t stable, ProcessInfo* machine, Future& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); } + Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Promise&& action ) : time(time), taskID(taskID), stable(stable), machine(machine), action(std::move(action)) {} + Task( double time, TaskPriority taskID, uint64_t stable, ProcessInfo* machine, Future& future ) : time(time), taskID(taskID), stable(stable), machine(machine) { future = action.getFuture(); } Task(Task&& rhs) BOOST_NOEXCEPT : time(rhs.time), taskID(rhs.taskID), stable(rhs.stable), machine(rhs.machine), action(std::move(rhs.action)) {} void operator= ( Task const& rhs ) { taskID = rhs.taskID; time = rhs.time; stable = rhs.stable; machine = rhs.machine; action = rhs.action; } Task( Task const& rhs ) : taskID(rhs.taskID), time(rhs.time), stable(rhs.stable), machine(rhs.machine), action(rhs.action) {} @@ -1643,23 +1643,23 @@ public: } } 
- virtual void onMainThread( Promise&& signal, int taskID ) { + virtual void onMainThread( Promise&& signal, TaskPriority taskID ) { // This is presumably coming from either a "fake" thread pool thread, i.e. it is actually on this thread // or a thread created with g_network->startThread ASSERT(getCurrentProcess()); mutex.enter(); - ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority); + ASSERT(taskID >= TaskPriority::Min && taskID <= TaskPriority::Max); tasks.push( Task( time, taskID, taskCount++, getCurrentProcess(), std::move(signal) ) ); mutex.leave(); } bool isOnMainThread() const override { return net2->isOnMainThread(); } - virtual Future onProcess( ISimulator::ProcessInfo *process, int taskID ) { + virtual Future onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID ) { return delay( 0, taskID, process ); } - virtual Future onMachine( ISimulator::ProcessInfo *process, int taskID ) { + virtual Future onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID ) { if( process->machine == 0 ) return Void(); return delay( 0, taskID, process->machine->machineProcess ); @@ -1668,7 +1668,7 @@ public: //time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because //time should only be modified from the main thread. double time; - int currentTaskID; + TaskPriority currentTaskID; //taskCount is guarded by ISimulator::mutex uint64_t taskCount; @@ -1698,9 +1698,9 @@ void startNewSimulator(bool objSerializer) { } ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) { - TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskDefaultDelay", TaskDefaultDelay); + TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskPriorityDefaultDelay", TaskPriority::DefaultDelay); - wait( g_sim2.delay( 0, TaskDefaultDelay, p ) ); // Switch to the machine in question + wait( g_sim2.delay( 0, TaskPriority::DefaultDelay, p ) ); // Switch to the machine in question try { ASSERT( kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || kt == ISimulator::RebootProcessAndDelete ); diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index 81e3ecc4f6..403db9ce57 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -137,8 +137,8 @@ public: ProcessInfo* getProcess( Endpoint const& endpoint ) { return getProcessByAddress(endpoint.getPrimaryAddress()); } ProcessInfo* getCurrentProcess() { return currentProcess; } - virtual Future onProcess( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0; - virtual Future onMachine( ISimulator::ProcessInfo *process, int taskID = -1 ) = 0; + virtual Future onProcess( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0; + virtual Future onMachine( ISimulator::ProcessInfo *process, TaskPriority taskID = TaskPriority::Zero ) = 0; virtual ProcessInfo* newProcess(const char* name, IPAddress ip, uint16_t port, uint16_t listenPerProcess, LocalityData locality, ProcessClass startingClass, const char* 
dataFolder, diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index 9fc12d502e..dec830483d 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -107,7 +107,7 @@ public: DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), clientInfo( new AsyncVar( ClientDBInfo() ) ), serverInfo( new AsyncVar( ServerDBInfo() ) ), - db( DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskDefaultEndpoint, true ) ) // SOMEDAY: Locality! + db( DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality! { } @@ -1171,7 +1171,7 @@ public: serverInfo.clusterInterface = ccInterface; serverInfo.myLocality = locality; db.serverInfo->set( serverInfo ); - cx = openDBOnServer(db.serverInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(db.serverInfo, TaskPriority::DefaultEndpoint, true, true); } ~ClusterControllerData() { @@ -1425,7 +1425,7 @@ void checkBetterDDOrRK(ClusterControllerData* self) { rkFitness = ProcessClass::ExcludeFit; } if (self->isProxyOrResolver(rkWorker.details.interf.locality.processId()) || rkFitness > bestFitnessForRK) { - TraceEvent("CC_HaltRK", self->id).detail("RKID", db.ratekeeper.get().id()) + TraceEvent("CCHaltRK", self->id).detail("RKID", db.ratekeeper.get().id()) .detail("Excluded", rkWorker.priorityInfo.isExcluded) .detail("Fitness", rkFitness).detail("BestFitness", bestFitnessForRK); self->recruitRatekeeper.set(true); @@ -1439,7 +1439,7 @@ void checkBetterDDOrRK(ClusterControllerData* self) { ddFitness = ProcessClass::ExcludeFit; } if (self->isProxyOrResolver(ddWorker.details.interf.locality.processId()) || ddFitness > bestFitnessForDD) { - TraceEvent("CC_HaltDD", self->id).detail("DDID", db.distributor.get().id()) + TraceEvent("CCHaltDD", self->id).detail("DDID", db.distributor.get().id()) .detail("Excluded", ddWorker.priorityInfo.isExcluded) .detail("Fitness", ddFitness).detail("BestFitness", bestFitnessForDD); ddWorker.haltDistributor = brokenPromiseToNever(db.distributor.get().haltDataDistributor.getReply(HaltDataDistributorRequest(self->id))); @@ -1920,13 +1920,13 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { self->clusterControllerDcId == req.distributorInterf.get().locality.dcId() && !self->recruitingDistributor) { const DataDistributorInterface& di = req.distributorInterf.get(); - TraceEvent("CC_RegisterDataDistributor", self->id).detail("DDID", di.id()); + TraceEvent("CCRegisterDataDistributor", self->id).detail("DDID", di.id()); self->db.setDistributor(di); } if (req.ratekeeperInterf.present()) { if((self->recruitingRatekeeperID.present() && self->recruitingRatekeeperID.get() != req.ratekeeperInterf.get().id()) || self->clusterControllerDcId != w.locality.dcId()) { - TraceEvent("CC_HaltRegisteringRatekeeper", self->id).detail("RKID", req.ratekeeperInterf.get().id()) + TraceEvent("CCHaltRegisteringRatekeeper", self->id).detail("RKID", req.ratekeeperInterf.get().id()) .detail("DcID", printable(self->clusterControllerDcId)) .detail("ReqDcID", printable(w.locality.dcId())) .detail("RecruitingRKID", self->recruitingRatekeeperID.present() ? 
self->recruitingRatekeeperID.get() : UID()); @@ -1934,9 +1934,9 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { } else if(!self->recruitingRatekeeperID.present()) { const RatekeeperInterface& rki = req.ratekeeperInterf.get(); const auto& ratekeeper = self->db.serverInfo->get().ratekeeper; - TraceEvent("CC_RegisterRatekeeper", self->id).detail("RKID", rki.id()); + TraceEvent("CCRegisterRatekeeper", self->id).detail("RKID", rki.id()); if (ratekeeper.present() && ratekeeper.get().id() != rki.id() && self->id_worker.count(ratekeeper.get().locality.processId())) { - TraceEvent("CC_HaltPreviousRatekeeper", self->id).detail("RKID", ratekeeper.get().id()) + TraceEvent("CCHaltPreviousRatekeeper", self->id).detail("RKID", ratekeeper.get().id()) .detail("DcID", printable(self->clusterControllerDcId)) .detail("ReqDcID", printable(w.locality.dcId())) .detail("RecruitingRKID", self->recruitingRatekeeperID.present() ? self->recruitingRatekeeperID.get() : UID()); @@ -2475,7 +2475,7 @@ ACTOR Future handleForcedRecoveries( ClusterControllerData *self, ClusterC ACTOR Future startDataDistributor( ClusterControllerData *self ) { wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID. - TraceEvent("CC_StartDataDistributor", self->id); + TraceEvent("CCStartDataDistributor", self->id); loop { try { state bool no_distributor = !self->db.serverInfo->get().distributor.present(); @@ -2494,16 +2494,16 @@ ACTOR Future startDataDistributor( ClusterControllerDa } InitializeDataDistributorRequest req(deterministicRandom()->randomUniqueID()); - TraceEvent("CC_DataDistributorRecruit", self->id).detail("Addr", worker.interf.address()); + TraceEvent("CCDataDistributorRecruit", self->id).detail("Addr", worker.interf.address()); ErrorOr distributor = wait( worker.interf.dataDistributor.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_DISTRIBUTOR_JOIN_DELAY, 0) ); if (distributor.present()) { - TraceEvent("CC_DataDistributorRecruited", self->id).detail("Addr", worker.interf.address()); + TraceEvent("CCDataDistributorRecruited", self->id).detail("Addr", worker.interf.address()); return distributor.get(); } } catch (Error& e) { - TraceEvent("CC_DataDistributorRecruitError", self->id).error(e); + TraceEvent("CCDataDistributorRecruitError", self->id).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } @@ -2520,7 +2520,7 @@ ACTOR Future monitorDataDistributor(ClusterControllerData *self) { loop { if ( self->db.serverInfo->get().distributor.present() ) { wait( waitFailureClient( self->db.serverInfo->get().distributor.get().waitFailure, SERVER_KNOBS->DD_FAILURE_TIME ) ); - TraceEvent("CC_DataDistributorDied", self->id) + TraceEvent("CCDataDistributorDied", self->id) .detail("DistributorId", self->db.serverInfo->get().distributor.get().id()); self->db.clearInterf(ProcessClass::DataDistributorClass); } else { @@ -2535,7 +2535,7 @@ ACTOR Future monitorDataDistributor(ClusterControllerData *self) { ACTOR Future startRatekeeper(ClusterControllerData *self) { wait(delay(0.0)); // If master fails at the same time, give it a chance to clear master PID. 
- TraceEvent("CC_StartRatekeeper", self->id); + TraceEvent("CCStartRatekeeper", self->id); loop { try { state bool no_ratekeeper = !self->db.serverInfo->get().ratekeeper.present(); @@ -2556,16 +2556,16 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { } self->recruitingRatekeeperID = req.reqId; - TraceEvent("CC_RecruitRatekeeper", self->id).detail("Addr", worker.interf.address()).detail("RKID", req.reqId); + TraceEvent("CCRecruitRatekeeper", self->id).detail("Addr", worker.interf.address()).detail("RKID", req.reqId); ErrorOr interf = wait( worker.interf.ratekeeper.getReplyUnlessFailedFor(req, SERVER_KNOBS->WAIT_FOR_RATEKEEPER_JOIN_DELAY, 0) ); if (interf.present()) { self->recruitRatekeeper.set(false); self->recruitingRatekeeperID = interf.get().id(); const auto& ratekeeper = self->db.serverInfo->get().ratekeeper; - TraceEvent("CC_RatekeeperRecruited", self->id).detail("Addr", worker.interf.address()).detail("RKID", interf.get().id()); + TraceEvent("CCRatekeeperRecruited", self->id).detail("Addr", worker.interf.address()).detail("RKID", interf.get().id()); if (ratekeeper.present() && ratekeeper.get().id() != interf.get().id() && self->id_worker.count(ratekeeper.get().locality.processId())) { - TraceEvent("CC_HaltRatekeeperAfterRecruit", self->id).detail("RKID", ratekeeper.get().id()) + TraceEvent("CCHaltRatekeeperAfterRecruit", self->id).detail("RKID", ratekeeper.get().id()) .detail("DcID", printable(self->clusterControllerDcId)); self->id_worker[ratekeeper.get().locality.processId()].haltRatekeeper = brokenPromiseToNever(ratekeeper.get().haltRatekeeper.getReply(HaltRatekeeperRequest(self->id))); } @@ -2577,7 +2577,7 @@ ACTOR Future startRatekeeper(ClusterControllerData *self) { } } catch (Error& e) { - TraceEvent("CC_RatekeeperRecruitError", self->id).error(e); + TraceEvent("CCRatekeeperRecruitError", self->id).error(e); if ( e.code() != error_code_no_more_servers ) { throw; } @@ -2595,7 +2595,7 @@ ACTOR Future monitorRatekeeper(ClusterControllerData *self) { if ( self->db.serverInfo->get().ratekeeper.present() && !self->recruitRatekeeper.get() ) { choose { when(wait(waitFailureClient( self->db.serverInfo->get().ratekeeper.get().waitFailure, SERVER_KNOBS->RATEKEEPER_FAILURE_TIME ))) { - TraceEvent("CC_RatekeeperDied", self->id) + TraceEvent("CCRatekeeperDied", self->id) .detail("RKID", self->db.serverInfo->get().ratekeeper.get().id()); self->db.clearInterf(ProcessClass::RatekeeperClass); } diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index dc9b41e5a6..d8432c7d1e 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -63,13 +63,13 @@ struct ClusterControllerFullInterface { void initEndpoints() { clientInterface.initEndpoints(); - recruitFromConfiguration.getEndpoint( TaskClusterController ); - recruitRemoteFromConfiguration.getEndpoint( TaskClusterController ); - recruitStorage.getEndpoint( TaskClusterController ); - registerWorker.getEndpoint( TaskClusterController ); - getWorkers.getEndpoint( TaskClusterController ); - registerMaster.getEndpoint( TaskClusterController ); - getServerDBInfo.getEndpoint( TaskClusterController ); + recruitFromConfiguration.getEndpoint( TaskPriority::ClusterController ); + recruitRemoteFromConfiguration.getEndpoint( TaskPriority::ClusterController ); + recruitStorage.getEndpoint( TaskPriority::ClusterController ); + registerWorker.getEndpoint( TaskPriority::ClusterController ); + getWorkers.getEndpoint( TaskPriority::ClusterController ); + 
registerMaster.getEndpoint( TaskPriority::ClusterController ); + getServerDBInfo.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp index 4c7ec289f0..e0df69f58b 100644 --- a/fdbserver/Coordination.actor.cpp +++ b/fdbserver/Coordination.actor.cpp @@ -20,8 +20,9 @@ #include "fdbserver/CoordinationInterface.h" #include "fdbserver/IKeyValueStore.h" -#include "flow/ActorCollection.h" #include "fdbserver/Knobs.h" +#include "fdbserver/WorkerInterface.actor.h" +#include "flow/ActorCollection.h" #include "flow/UnitTest.h" #include "flow/IndexedSet.h" #include "flow/actorcompiler.h" // This must be the last #include. @@ -52,8 +53,8 @@ GenerationRegInterface::GenerationRegInterface( NetworkAddress remote ) GenerationRegInterface::GenerationRegInterface( INetwork* local ) { - read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskCoordination ); - write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskCoordination ); + read.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_READ, TaskPriority::Coordination ); + write.makeWellKnownEndpoint( WLTOKEN_GENERATIONREG_WRITE, TaskPriority::Coordination ); } LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote) @@ -67,9 +68,9 @@ LeaderElectionRegInterface::LeaderElectionRegInterface(NetworkAddress remote) LeaderElectionRegInterface::LeaderElectionRegInterface(INetwork* local) : ClientLeaderRegInterface(local) { - candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskCoordination ); - leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskCoordination ); - forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskCoordination ); + candidacy.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_CANDIDACY, TaskPriority::Coordination ); + leaderHeartbeat.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_LEADERHEARTBEAT, TaskPriority::Coordination ); + forward.makeWellKnownEndpoint( WLTOKEN_LEADERELECTIONREG_FORWARD, TaskPriority::Coordination ); } ServerCoordinators::ServerCoordinators( Reference cf ) @@ -360,11 +361,11 @@ struct LeaderRegisterCollection { return Void(); } - LeaderElectionRegInterface& getInterface(KeyRef key) { + LeaderElectionRegInterface& getInterface(KeyRef key, UID id) { auto i = registerInterfaces.find( key ); if (i == registerInterfaces.end()) { Key k = key; - Future a = wrap(this, k, leaderRegister(registerInterfaces[k], k) ); + Future a = wrap(this, k, leaderRegister(registerInterfaces[k], k), id); if (a.isError()) throw a.getError(); ASSERT( !a.isReady() ); actors.add( a ); @@ -374,11 +375,15 @@ struct LeaderRegisterCollection { return i->value; } - ACTOR static Future wrap( LeaderRegisterCollection* self, Key key, Future actor ) { + ACTOR static Future wrap( LeaderRegisterCollection* self, Key key, Future actor, UID id ) { state Error e; try { + // FIXME: Get worker ID here + startRole(Role::COORDINATOR, id, UID()); wait(actor); + endRole(Role::COORDINATOR, id, "Coordinator changed"); } catch (Error& err) { + endRole(Role::COORDINATOR, id, err.what(), err.code() == error_code_actor_cancelled, err); if (err.code() == error_code_actor_cancelled) throw; e = err; @@ -392,7 +397,7 @@ struct LeaderRegisterCollection { // leaderServer multiplexes multiple leaderRegisters onto a single LeaderElectionRegInterface, // creating and destroying them on demand. 
-ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore *pStore) { +ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore *pStore, UID id) { state LeaderRegisterCollection regs( pStore ); state ActorCollection forwarders(false); @@ -404,21 +409,21 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore if( forward.present() ) req.reply.send( forward.get() ); else - regs.getInterface(req.key).getLeader.send( req ); + regs.getInterface(req.key, id).getLeader.send( req ); } when ( CandidacyRequest req = waitNext( interf.candidacy.getFuture() ) ) { Optional forward = regs.getForward(req.key); if( forward.present() ) req.reply.send( forward.get() ); else - regs.getInterface(req.key).candidacy.send(req); + regs.getInterface(req.key, id).candidacy.send(req); } when ( LeaderHeartbeatRequest req = waitNext( interf.leaderHeartbeat.getFuture() ) ) { Optional forward = regs.getForward(req.key); if( forward.present() ) req.reply.send( false ); else - regs.getInterface(req.key).leaderHeartbeat.send(req); + regs.getInterface(req.key, id).leaderHeartbeat.send(req); } when ( ForwardRequest req = waitNext( interf.forward.getFuture() ) ) { Optional forward = regs.getForward(req.key); @@ -426,7 +431,7 @@ ACTOR Future leaderServer(LeaderElectionRegInterface interf, OnDemandStore req.reply.send( Void() ); else { forwarders.add( LeaderRegisterCollection::setForward( ®s, req.key, ClusterConnectionString(req.conn.toString()) ) ); - regs.getInterface(req.key).forward.send(req); + regs.getInterface(req.key, id).forward.send(req); } } when( wait( forwarders.getResult() ) ) { ASSERT(false); throw internal_error(); } @@ -442,7 +447,7 @@ ACTOR Future coordinationServer(std::string dataFolder) { TraceEvent("CoordinationServer", myID).detail("MyInterfaceAddr", myInterface.read.getEndpoint().getPrimaryAddress()).detail("Folder", dataFolder); try { - wait( localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store) || store.getError() ); + wait( localGenerationReg(myInterface, &store) || leaderServer(myLeaderInterface, &store, myID) || store.getError() ); throw internal_error(); } catch (Error& e) { TraceEvent("CoordinationServerError", myID).error(e, true); diff --git a/fdbserver/CoroFlow.actor.cpp b/fdbserver/CoroFlow.actor.cpp index af9b5ac565..22eaab2b0f 100644 --- a/fdbserver/CoroFlow.actor.cpp +++ b/fdbserver/CoroFlow.actor.cpp @@ -263,7 +263,7 @@ typedef WorkPool CoroPool; -ACTOR void coroSwitcher( Future what, int taskID, Coro* coro ) { +ACTOR void coroSwitcher( Future what, TaskPriority taskID, Coro* coro ) { try { // state double t = now(); wait(what); diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 3f2ba3da3a..6209723697 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -88,7 +88,7 @@ struct TCMachineInfo : public ReferenceCounted { ACTOR Future updateServerMetrics( TCServerInfo *server ) { state StorageServerInterface ssi = server->lastKnownInterface; - state Future> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch ); + state Future> metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch ); state Future resetRequest = Never(); state Future> interfaceChanged( server->onInterfaceChanged ); state Future serverRemoved( server->onRemoved ); @@ -104,7 +104,7 @@ ACTOR Future updateServerMetrics( TCServerInfo *server ) { 
return Void(); } metricsRequest = Never(); - resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskDataDistributionLaunch ); + resetRequest = delay( SERVER_KNOBS->METRIC_DELAY, TaskPriority::DataDistributionLaunch ); } when( std::pair _ssi = wait( interfaceChanged ) ) { ssi = _ssi.first; @@ -120,7 +120,7 @@ ACTOR Future updateServerMetrics( TCServerInfo *server ) { } else { resetRequest = Never(); - metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskDataDistributionLaunch ); + metricsRequest = ssi.getPhysicalMetrics.tryGetReply( GetPhysicalMetricsRequest(), TaskPriority::DataDistributionLaunch ); } } } @@ -636,9 +636,9 @@ struct DDTeamCollection : ReferenceCounted { shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder(Void()), badTeamRemover(Void()), redundantMachineTeamRemover(Void()), redundantServerTeamRemover(Void()), configuration(configuration), readyToStart(readyToStart), clearHealthyZoneFuture(Void()), - checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution)), + checkTeamDelay(delay(SERVER_KNOBS->CHECK_TEAM_DELAY, TaskPriority::DataDistribution)), initialFailureReactionDelay( - delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution)), + delayed(readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskPriority::DataDistribution)), healthyTeamCount(0), storageServerSet(new LocalityMap()), initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)), optimalTeamCount(0), recruitingStream(0), restartRecruiting(SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY), @@ -672,7 +672,7 @@ struct DDTeamCollection : ReferenceCounted { ACTOR static Future logOnCompletion( Future signal, DDTeamCollection* self ) { wait(signal); - wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskDataDistribution)); + wait(delay(SERVER_KNOBS->LOG_ON_COMPLETION_DELAY, TaskPriority::DataDistribution)); if(!self->primary || self->configuration.usableRegions == 1) { TraceEvent("DDTrackerStarting", self->distributorId) @@ -1310,7 +1310,7 @@ struct DDTeamCollection : ReferenceCounted { // Five steps to create each machine team, which are document in the function // Reuse ReplicationPolicy selectReplicas func to select machine team // return number of added machine teams - int addBestMachineTeams(int targetMachineTeamsToBuild) { + int addBestMachineTeams(int targetMachineTeamsToBuild, int remainingMachineTeamBudget) { int addedMachineTeams = 0; int machineTeamsToBuild = 0; @@ -1328,7 +1328,7 @@ struct DDTeamCollection : ReferenceCounted { int loopCount = 0; // Add a team in each iteration - while (addedMachineTeams < machineTeamsToBuild) { + while (addedMachineTeams < machineTeamsToBuild || addedMachineTeams < remainingMachineTeamBudget) { // Step 2: Get least used machines from which we choose machines as a machine team std::vector> leastUsedMachines; // A less used machine has less number of teams int minTeamCount = std::numeric_limits::max(); @@ -1378,6 +1378,8 @@ struct DDTeamCollection : ReferenceCounted { // that have the least-utilized server team.clear(); auto success = machineLocalityMap.selectReplicas(configuration.storagePolicy, forcedAttributes, team); + // NOTE: selectReplicas() should always return success when storageTeamSize = 1 + ASSERT_WE_THINK(configuration.storageTeamSize > 1 || (configuration.storageTeamSize == 1 && success)); if (!success) { break; } @@ -1431,6 +1433,9 @@ struct DDTeamCollection : ReferenceCounted { addMachineTeam(machines); 
addedMachineTeams++; + // Update the remaining machine team budget because the budget may decrease by + // any value between 1 and storageTeamSize + remainingMachineTeamBudget = getRemainingMachineTeamBudget(); } else { TraceEvent(SevWarn, "DataDistributionBuildTeams", distributorId) .detail("Primary", primary) @@ -1590,6 +1595,32 @@ struct DDTeamCollection : ReferenceCounted { return totalHealthyMachineCount; } + std::pair calculateMinMaxServerTeamNumOnServer() { + int64_t minTeamNumber = std::numeric_limits::max(); + int64_t maxTeamNumber = 0; + for (auto& server : server_info) { + if (server_status.get(server.first).isUnhealthy()) { + continue; + } + minTeamNumber = std::min((int64_t) server.second->teams.size(), minTeamNumber); + maxTeamNumber = std::max((int64_t) server.second->teams.size(), maxTeamNumber); + } + return std::make_pair(minTeamNumber, maxTeamNumber); + } + + std::pair calculateMinMaxMachineTeamNumOnMachine() { + int64_t minTeamNumber = std::numeric_limits::max(); + int64_t maxTeamNumber = 0; + for (auto& machine : machine_info) { + if (!isMachineHealthy(machine.second)) { + continue; + } + minTeamNumber = std::min((int64_t) machine.second->machineTeams.size(), minTeamNumber); + maxTeamNumber = std::max((int64_t) machine.second->machineTeams.size(), maxTeamNumber); + } + return std::make_pair(minTeamNumber, maxTeamNumber); + } + // Sanity check bool isServerTeamNumberCorrect(Reference& mt) { int num = 0; @@ -1662,12 +1693,41 @@ struct DDTeamCollection : ReferenceCounted { return healthyTeamCount; } + // Each machine is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + // remainingMachineTeamBudget is the number of machine teams needed to ensure every machine has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int getRemainingMachineTeamBudget() { + int remainingMachineTeamBudget = 0; + for (auto& m : machine_info) { + int machineTeamCount = m.second->machineTeams.size(); + remainingMachineTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - machineTeamCount)); + } + + // We over-provision the remainingMachineTeamBudget because we do not know, when a new machine team is built, + // how many times it can be counted into the budget. 
For example, when a new machine is added, + a new machine team only consumes 1 such budget + return remainingMachineTeamBudget; + } + + // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams, + int getRemainingServerTeamBudget() { + // remainingTeamBudget is the number of teams needed to ensure every server has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int remainingTeamBudget = 0; + for (auto& s : server_info) { + int numValidTeams = s.second->teams.size(); + remainingTeamBudget += std::max(0, (int)(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER - numValidTeams)); + } + + return remainingTeamBudget; + } + // Create server teams based on machine teams // Before the number of machine teams reaches the threshold, build a machine team for each server team // When it reaches the threshold, first try to build a server team with existing machine teams; if failed, // build an extra machine team and record the event in trace - int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber) { - ASSERT(teamsToBuild > 0); + int addTeamsBestOf(int teamsToBuild, int desiredTeamNumber, int maxTeamNumber, int remainingTeamBudget) { + ASSERT(teamsToBuild >= 0); ASSERT_WE_THINK(machine_info.size() > 0 || server_info.size() == 0); int addedMachineTeams = 0; @@ -1678,27 +1738,28 @@ struct DDTeamCollection : ReferenceCounted { // When we change configuration, we may have machine teams with storageTeamSize in the old configuration. int healthyMachineTeamCount = getHealthyMachineTeamCount(); int totalMachineTeamCount = machineTeams.size(); - int totalHealthyMachineCount = calculateHealthyMachineCount(); + int remainingMachineTeamBudget = getRemainingMachineTeamBudget(); int desiredMachineTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * totalHealthyMachineCount; int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; // machineTeamsToBuild mimics how the teamsToBuild is calculated in buildTeams() - int machineTeamsToBuild = - std::min(desiredMachineTeams - healthyMachineTeamCount, maxMachineTeams - totalMachineTeamCount); + int machineTeamsToBuild = std::max( + 0, std::min(desiredMachineTeams - healthyMachineTeamCount, maxMachineTeams - totalMachineTeamCount)); TraceEvent("BuildMachineTeams") .detail("TotalHealthyMachine", totalHealthyMachineCount) .detail("HealthyMachineTeamCount", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) - .detail("MachineTeamsToBuild", machineTeamsToBuild); + .detail("MachineTeamsToBuild", machineTeamsToBuild) + .detail("RemainingMachineTeamBudget", remainingMachineTeamBudget); // Pre-build all machine teams until we have the desired number of machine teams - if (machineTeamsToBuild > 0) { - addedMachineTeams = addBestMachineTeams(machineTeamsToBuild); + if (machineTeamsToBuild > 0 || remainingMachineTeamBudget > 0) { + addedMachineTeams = addBestMachineTeams(machineTeamsToBuild, remainingMachineTeamBudget); } - while (addedTeams < teamsToBuild) { + while (addedTeams < teamsToBuild || addedTeams < remainingTeamBudget) { // Step 1: Create 1 best machine team std::vector bestServerTeam; int bestScore = std::numeric_limits::max(); @@ -1775,6 +1836,7 @@ struct DDTeamCollection : ReferenceCounted { // Step 4: Add the server team addTeam(bestServerTeam.begin(), bestServerTeam.end(), false); addedTeams++; + remainingTeamBudget = getRemainingServerTeamBudget(); if (++loopCount > 2 * teamsToBuild * (configuration.storageTeamSize + 1)) { break; @@ -1783,10
+1845,14 @@ struct DDTeamCollection : ReferenceCounted { healthyMachineTeamCount = getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) .detail("AddedTeamNumber", addedTeams) .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("RemainingTeamBudget", remainingTeamBudget) .detail("CurrentTeamNumber", teams.size()) .detail("DesiredTeamNumber", desiredTeamNumber) .detail("MaxTeamNumber", maxTeamNumber) @@ -1796,6 +1862,11 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); return addedTeams; @@ -1812,10 +1883,14 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = calculateMinMaxMachineTeamNumOnMachine(); + TraceEvent("TeamCollectionInfo", distributorId) .detail("Primary", primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", 0) + .detail("RemainingTeamBudget", 0) .detail("CurrentTeamNumber", teams.size()) .detail("DesiredTeamNumber", desiredServerTeams) .detail("MaxTeamNumber", maxServerTeams) @@ -1825,14 +1900,22 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", doBuildTeams) .trackLatest("TeamCollectionInfo"); - // Debug purpose -// if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) { -// // When the number of machine teams is over the limit, print out the current team info. -// traceAllInfo(true); -// } + // Advance time so that we will not have multiple TeamCollectionInfo at the same time, otherwise + // simulation test will randomly pick one TeamCollectionInfo trace, which could be the one before build teams + // wait(delay(0.01)); + // Debug purpose + // if (healthyMachineTeamCount > desiredMachineTeams || machineTeams.size() > maxMachineTeams) { + // // When the number of machine teams is over the limit, print out the current team info. + // traceAllInfo(true); + // } } // Use the current set of known processes (from server_info) to compute an optimized set of storage server teams. 
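The getRemainingMachineTeamBudget() and getRemainingServerTeamBudget() helpers added above compute how many team memberships are still missing before every machine or server reaches SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, and team building now continues while that budget is positive. A small standalone sketch of the computation follows, using a stand-in container and knob value rather than DDTeamCollection's real state.

#include <algorithm>
#include <iostream>
#include <map>
#include <string>

// Stand-in for SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER; the real value is a knob.
constexpr int DESIRED_TEAMS_PER_SERVER = 5;

// Sum, over every machine (or server), of how many more teams it still needs.
// The budget is deliberately an over-estimate of the teams to build: one new
// team can cover several under-served members at once, or only one (e.g. when
// a single freshly added machine is the only one that is short).
int remainingTeamBudget(const std::map<std::string, int>& teamCountByMember) {
	int budget = 0;
	for (const auto& member : teamCountByMember) {
		budget += std::max(0, DESIRED_TEAMS_PER_SERVER - member.second);
	}
	return budget;
}

int main() {
	// m1 and m2 already have enough teams; m3 is new and still needs three more.
	std::map<std::string, int> machines = { { "m1", 5 }, { "m2", 7 }, { "m3", 2 } };
	std::cout << remainingTeamBudget(machines) << "\n"; // prints 3
	return 0;
}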
@@ -1888,10 +1971,14 @@ struct DDTeamCollection : ReferenceCounted { totalTeamCount++; } } + // Each server is expected to have SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER, + // remainingTeamBudget is the number of teams needed to ensure every server has + // SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER teams + int remainingTeamBudget = self->getRemainingServerTeamBudget(); // teamsToBuild is calculated such that we will not build too many teams in the situation // when all (or most of) teams become unhealthy temporarily and then healthy again - state int teamsToBuild = std::min(desiredTeams - teamCount, maxTeams - totalTeamCount); + state int teamsToBuild = std::max(0, std::min(desiredTeams - teamCount, maxTeams - totalTeamCount)); TraceEvent("BuildTeamsBegin", self->distributorId) .detail("TeamsToBuild", teamsToBuild) @@ -1908,13 +1995,13 @@ struct DDTeamCollection : ReferenceCounted { .detail("MachineCount", self->machine_info.size()) .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER); - if (teamsToBuild > 0) { + if (teamsToBuild > 0 || remainingTeamBudget > 0) { state vector> builtTeams; // addTeamsBestOf() will not add more teams than needed. // If the team number is more than the desired, the extra teams are added in the code path when // a team is added as an initial team - int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams); + int addedTeams = self->addTeamsBestOf(teamsToBuild, desiredTeams, maxTeams, remainingTeamBudget); if (addedTeams <= 0 && self->teams.size() == 0) { TraceEvent(SevWarn, "NoTeamAfterBuildTeam") @@ -1930,10 +2017,14 @@ struct DDTeamCollection : ReferenceCounted { int maxMachineTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * totalHealthyMachineCount; int healthyMachineTeamCount = self->getHealthyMachineTeamCount(); + std::pair minMaxTeamNumberOnServer = self->calculateMinMaxServerTeamNumOnServer(); + std::pair minMaxMachineTeamNumberOnMachine = self->calculateMinMaxMachineTeamNumOnMachine(); + TraceEvent("TeamCollectionInfo", self->distributorId) .detail("Primary", self->primary) .detail("AddedTeamNumber", 0) .detail("AimToBuildTeamNumber", teamsToBuild) + .detail("RemainingTeamBudget", remainingTeamBudget) .detail("CurrentTeamNumber", self->teams.size()) .detail("DesiredTeamNumber", desiredTeams) .detail("MaxTeamNumber", maxTeams) @@ -1943,6 +2034,11 @@ struct DDTeamCollection : ReferenceCounted { .detail("DesiredMachineTeams", desiredMachineTeams) .detail("MaxMachineTeams", maxMachineTeams) .detail("TotalHealthyMachine", totalHealthyMachineCount) + .detail("MinTeamNumberOnServer", minMaxTeamNumberOnServer.first) + .detail("MaxTeamNumberOnServer", minMaxTeamNumberOnServer.second) + .detail("MinMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.first) + .detail("MaxMachineTeamNumberOnMachine", minMaxMachineTeamNumberOnMachine.second) + .detail("DoBuildTeams", self->doBuildTeams) .trackLatest("TeamCollectionInfo"); } } @@ -1951,7 +2047,7 @@ struct DDTeamCollection : ReferenceCounted { //Building teams can cause servers to become undesired, which can make teams unhealthy. 
//Let all of these changes get worked out before responding to the get team request - wait( delay(0, TaskDataDistributionLaunch) ); + wait( delay(0, TaskPriority::DataDistributionLaunch) ); return Void(); } @@ -2264,7 +2360,7 @@ ACTOR Future waitUntilHealthy(DDTeamCollection* self) { TraceEvent("WaitUntilHealthyStalled", self->distributorId).detail("Primary", self->primary).detail("ZeroHealthy", self->zeroHealthyTeams->get()).detail("ProcessingUnhealthy", self->processingUnhealthy->get()); wait(self->zeroHealthyTeams->onChange() || self->processingUnhealthy->onChange()); } - wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskLowPriority)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. + wait(delay(SERVER_KNOBS->DD_STALL_CHECK_DELAY, TaskPriority::Low)); //After the team trackers wait on the initial failure reaction delay, they yield. We want to make sure every tracker has had the opportunity to send their relocations to the queue. if(!self->zeroHealthyTeams->get() && !self->processingUnhealthy->get()) { return Void(); } @@ -2340,6 +2436,16 @@ ACTOR Future machineTeamRemover(DDTeamCollection* self) { team = mt->serverTeams[teamIndex]; ASSERT(team->machineTeam->machineIDs == mt->machineIDs); // Sanity check + // Check if a server will have 0 team after the team is removed + for (auto& s : team->getServers()) { + if (s->teams.size() == 0) { + TraceEvent(SevError, "TeamRemoverTooAggressive") + .detail("Server", s->id) + .detail("Team", team->getServerIDsStr()); + self->traceAllInfo(true); + } + } + // The team will be marked as a bad team bool foundTeam = self->removeTeam(team); ASSERT(foundTeam == true); @@ -2649,7 +2755,12 @@ ACTOR Future teamTracker(DDTeamCollection* self, Reference tea //If we cannot find the team, it could be a bad team so assume unhealthy priority if(!found) { - maxPriority = std::max( maxPriority, PRIORITY_TEAM_UNHEALTHY ); + // If the input team (in function parameters) is a redundant team, found will be + // false We want to differentiate the redundant_team from unhealthy_team in + // terms of relocate priority + maxPriority = + std::max(maxPriority, redundantTeam ? 
PRIORITY_TEAM_REDUNDANT + : PRIORITY_TEAM_UNHEALTHY); } } else { TEST(true); // A removed server is still associated with a team in SABTF @@ -2747,7 +2858,7 @@ ACTOR Future trackExcludedServers( DDTeamCollection* self ) { if (nchid != lastChangeID) break; - wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskDataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead + wait( delay( SERVER_KNOBS->SERVER_LIST_DELAY, TaskPriority::DataDistribution ) ); // FIXME: make this tr.watch( excludedServersVersionKey ) instead tr = Transaction(self->cx); } catch (Error& e) { wait( tr.onError(e) ); @@ -2843,12 +2954,18 @@ ACTOR Future waitHealthyZoneChange( DDTeamCollection* self ) { if(val.present()) { auto p = decodeHealthyZoneValue(val.get()); if(p.second > tr.getReadVersion().get()) { - healthyZoneTimeout = delay((p.second - tr.getReadVersion().get())/(double)SERVER_KNOBS->VERSIONS_PER_SECOND); - self->healthyZone.set(p.first); - } else { + double timeoutSeconds = (p.second - tr.getReadVersion().get())/(double)SERVER_KNOBS->VERSIONS_PER_SECOND; + healthyZoneTimeout = delay(timeoutSeconds); + if(self->healthyZone.get() != p.first) { + TraceEvent("MaintenanceZoneStart", self->distributorId).detail("ZoneID", printable(p.first)).detail("EndVersion", p.second).detail("Duration", timeoutSeconds); + self->healthyZone.set(p.first); + } + } else if(self->healthyZone.get().present()) { + TraceEvent("MaintenanceZoneEnd", self->distributorId); self->healthyZone.set(Optional()); } - } else { + } else if(self->healthyZone.get().present()) { + TraceEvent("MaintenanceZoneEnd", self->distributorId); self->healthyZone.set(Optional()); } @@ -2866,14 +2983,14 @@ ACTOR Future serverMetricsPolling( TCServerInfo *server) { state double lastUpdate = now(); loop { wait( updateServerMetrics( server ) ); - wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskDataDistributionLaunch ) ); + wait( delayUntil( lastUpdate + SERVER_KNOBS->STORAGE_METRICS_POLLING_DELAY + SERVER_KNOBS->STORAGE_METRICS_RANDOM_DELAY * deterministicRandom()->random01(), TaskPriority::DataDistributionLaunch ) ); lastUpdate = now(); } } //Returns the KeyValueStoreType of server if it is different from self->storeType ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo *server) { - state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID(TaskDataDistribution))); + state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID(TaskPriority::DataDistribution))); if(type == self->configuration.storageServerStoreType && (self->includedDCs.empty() || std::find(self->includedDCs.begin(), self->includedDCs.end(), server->lastKnownInterface.locality.dcId()) != self->includedDCs.end()) ) wait(Future(Never())); @@ -2896,7 +3013,7 @@ ACTOR Future waitForAllDataRemoved( Database cx, UID serverID, Version add } // Wait for any change to the serverKeys for this server - wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskDataDistribution) ); + wait( delay(SERVER_KNOBS->ALL_DATA_REMOVED_DELAY, TaskPriority::DataDistribution) ); tr.reset(); } catch (Error& e) { wait( tr.onError(e) ); @@ -2939,7 +3056,7 @@ ACTOR Future storageServerFailureTracker( ASSERT(!inHealthyZone); healthChanged = IFailureMonitor::failureMonitor().onStateEqual( interf.waitFailure.getEndpoint(), FailureStatus(false)); } 
else if(!inHealthyZone) { - healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskDataDistribution); + healthChanged = waitFailureClientStrict(interf.waitFailure, SERVER_KNOBS->DATA_DISTRIBUTION_FAILURE_REACTION_TIME, TaskPriority::DataDistribution); } choose { when ( wait(healthChanged) ) { @@ -2949,6 +3066,7 @@ ACTOR Future storageServerFailureTracker( } if(status->isFailed && self->healthyZone.get().present() && self->clearHealthyZoneFuture.isReady()) { self->clearHealthyZoneFuture = clearHealthyZone(self->cx); + TraceEvent("MaintenanceZoneCleared", self->distributorId); self->healthyZone.set(Optional()); } @@ -3062,11 +3180,14 @@ ACTOR Future storageServerTracker( if(hasWrongStoreTypeOrDC) self->restartRecruiting.trigger(); - if ( lastIsUnhealthy && !status.isUnhealthy() && !server->teams.size() ) { + if (lastIsUnhealthy && !status.isUnhealthy() && + server->teams.size() < SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) { self->doBuildTeams = true; + self->restartTeamBuilder.trigger(); // This does not trigger building teams if there exist healthy teams } lastIsUnhealthy = status.isUnhealthy(); + state bool recordTeamCollectionInfo = false; choose { when( wait( failureTracker ) ) { // The server is failed AND all data has been removed from it, so permanently remove it. @@ -3170,7 +3291,8 @@ ACTOR Future storageServerTracker( self->badTeamRemover = removeBadTeams(self); self->addActor.send(self->badTeamRemover); // The team number changes, so we need to update the team number info - self->traceTeamCollectionInfo(); + // self->traceTeamCollectionInfo(); + recordTeamCollectionInfo = true; } } @@ -3178,10 +3300,13 @@ ACTOR Future storageServerTracker( // We rely on the old failureTracker being actorCancelled since the old actor now has a pointer to an invalid location status = ServerStatus( status.isFailed, status.isUndesired, server->lastKnownInterface.locality ); + // self->traceTeamCollectionInfo(); + recordTeamCollectionInfo = true; //Restart the storeTracker for the new interface storeTracker = keyValueStoreTypeTracker(self, server); hasWrongStoreTypeOrDC = false; self->restartTeamBuilder.trigger(); + if(restartRecruiting) self->restartRecruiting.trigger(); } @@ -3202,6 +3327,10 @@ ACTOR Future storageServerTracker( server->wakeUpTracker = Promise(); } } + + if (recordTeamCollectionInfo) { + self->traceTeamCollectionInfo(); + } } } catch( Error &e ) { if (e.code() != error_code_actor_cancelled && errorOut.canBeSet()) @@ -3229,7 +3358,7 @@ ACTOR Future monitorStorageServerRecruitment(DDTeamCollection* self) { loop { choose { when( wait( self->recruitingStream.onChange() ) ) {} - when( wait( self->recruitingStream.get() == 0 ? delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskDataDistribution) : Future(Never()) ) ) { break; } + when( wait( self->recruitingStream.get() == 0 ? 
delay(SERVER_KNOBS->RECRUITMENT_IDLE_DELAY, TaskPriority::DataDistribution) : Future(Never()) ) ) { break; } } } TraceEvent("StorageServerRecruitment", self->distributorId) @@ -3256,12 +3385,12 @@ ACTOR Future initializeStorage( DDTeamCollection* self, RecruitStorageRepl self->recruitingIds.insert(interfaceId); self->recruitingLocalities.insert(candidateWorker.worker.address()); - state ErrorOr newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskDataDistribution ) ); + state ErrorOr newServer = wait( candidateWorker.worker.storage.tryGetReply( isr, TaskPriority::DataDistribution ) ); if(newServer.isError()) { TraceEvent(SevWarn, "DDRecruitmentError").error(newServer.getError()); if( !newServer.isError( error_code_recruitment_failed ) && !newServer.isError( error_code_request_maybe_delivered ) ) throw newServer.getError(); - wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskDataDistribution) ); + wait( delay(SERVER_KNOBS->STORAGE_RECRUITMENT_DELAY, TaskPriority::DataDistribution) ); } self->recruitingIds.erase(interfaceId); self->recruitingLocalities.erase(candidateWorker.worker.address()); @@ -3326,7 +3455,7 @@ ACTOR Future storageRecruiter( DDTeamCollection* self, Referenceget().clusterInterface.recruitStorage.getReply( rsr, TaskDataDistribution ) ); + fCandidateWorker = brokenPromiseToNever( db->get().clusterInterface.recruitStorage.getReply( rsr, TaskPriority::DataDistribution ) ); } choose { @@ -3501,7 +3630,7 @@ ACTOR Future dataDistributionTeamCollection( ACTOR Future waitForDataDistributionEnabled( Database cx ) { state Transaction tr(cx); loop { - wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskDataDistribution)); + wait(delay(SERVER_KNOBS->DD_ENABLED_CHECK_DELAY, TaskPriority::DataDistribution)); try { Optional mode = wait( tr.get( dataDistributionModeKey ) ); @@ -3629,7 +3758,7 @@ ACTOR Future dataDistribution(Reference self) state double lastLimited = 0; self->addActor.send( monitorBatchLimitedTime(self->dbInfo, &lastLimited) ); - state Database cx = openDBOnServer(self->dbInfo, TaskDataDistributionLaunch, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DataDistributionLaunch, true, true); cx->locationCacheSize = SERVER_KNOBS->DD_LOCATION_CACHE_SIZE; //cx->setOption( FDBDatabaseOptions::LOCATION_CACHE_SIZE, StringRef((uint8_t*) &SERVER_KNOBS->DD_LOCATION_CACHE_SIZE, 8) ); @@ -3759,7 +3888,7 @@ ACTOR Future dataDistribution(Reference self) } output.send( RelocateShard( keys, unhealthy ? 
PRIORITY_TEAM_UNHEALTHY : PRIORITY_RECOVER_MOVE ) ); } - wait( yield(TaskDataDistribution) ); + wait( yield(TaskPriority::DataDistribution) ); } vector tcis; @@ -3831,7 +3960,7 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference collection = actorCollection( self->addActor.getFuture() ); try { - TraceEvent("DataDistributor_Running", di.id()); + TraceEvent("DataDistributorRunning", di.id()); self->addActor.send( waitFailureServer(di.waitFailure.getFuture()) ); state Future distributor = reportErrorsExcept( dataDistribution(self), "DataDistribution", di.id(), &normalDataDistributorErrors() ); @@ -3849,10 +3978,10 @@ ACTOR Future dataDistributor(DataDistributorInterface di, Reference policy = Reference(new PolicyAcross(teamSize, "zoneid", Reference(new PolicyOne()))); state DDTeamCollection* collection = testMachineTeamCollection(teamSize, policy, processSize); - collection->addTeamsBestOf(30, desiredTeams, maxTeams); + collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30); ASSERT(collection->sanityCheckTeams() == true); @@ -3980,8 +4109,8 @@ TEST_CASE("DataDistribution/AddTeamsBestOf/NotUseMachineID") { return Void(); } - collection->addBestMachineTeams(30); // Create machine teams to help debug - collection->addTeamsBestOf(30, desiredTeams, maxTeams); + collection->addBestMachineTeams(30, 30); // Create machine teams to help debug + collection->addTeamsBestOf(30, desiredTeams, maxTeams, 30); collection->sanityCheckTeams(); // Server team may happen to be on the same machine team, although unlikely if (collection) delete (collection); @@ -3996,7 +4125,7 @@ TEST_CASE("DataDistribution/AddAllTeams/isExhaustive") { state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize; state DDTeamCollection* collection = testTeamCollection(3, policy, processSize); - int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(200, desiredTeams, maxTeams, 200); delete(collection); @@ -4016,11 +4145,11 @@ TEST_CASE("/DataDistribution/AddAllTeams/withLimit") { state DDTeamCollection* collection = testTeamCollection(3, policy, processSize); - int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10); delete(collection); - ASSERT(result == 10); + ASSERT(result >= 10); return Void(); } @@ -4036,9 +4165,9 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/SkippingBusyServers") { collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); - int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams); + int result = collection->addTeamsBestOf(8, desiredTeams, maxTeams, 8); - ASSERT(result == 8); + ASSERT(result >= 8); for(auto process = collection->server_info.begin(); process != collection->server_info.end(); process++) { auto teamCount = process->second->teams.size(); @@ -4066,8 +4195,8 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); collection->addTeam(std::set({ UID(1, 0), UID(3, 0), UID(4, 0) }), true); - collection->addBestMachineTeams(10); - int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams); + collection->addBestMachineTeams(10, 10); + int result = collection->addTeamsBestOf(10, desiredTeams, maxTeams, 10); if (collection->machineTeams.size() != 10 || result != 8) { collection->traceAllInfo(true); // Debug message diff --git 
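Review note: every TaskXxx constant in the data-distribution hunks above becomes a scoped TaskPriority enumerator, and the old raw-arithmetic form (TaskDataDistribution - 2, seen in the queue file that follows) becomes decrementPriority(...). A minimal self-contained sketch of that shape; the numeric values below are illustrative only, the real enumerators live in flow's network header.

#include <cstdint>

// Illustrative values only, not the real ones.
enum class TaskPriority : int64_t {
	DataDistributionLaunch = 4900,
	DataDistribution       = 3500,
	UpdateStorage          = 3000
};

// Stands in for the helper used in place of "TaskDataDistribution - 1/-2":
// step down one priority unit while keeping the scoped-enum type.
inline TaskPriority decrementPriority(TaskPriority p) {
	return static_cast<TaskPriority>(static_cast<int64_t>(p) - 1);
}

// Old call site:  wait( delay( 0.5, TaskDataDistribution - 2 ) );
// New call site:  wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution)) ) );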
a/fdbserver/DataDistributionQueue.actor.cpp b/fdbserver/DataDistributionQueue.actor.cpp index e155254850..d11fc63146 100644 --- a/fdbserver/DataDistributionQueue.actor.cpp +++ b/fdbserver/DataDistributionQueue.actor.cpp @@ -512,9 +512,9 @@ struct DDQueueData { // FIXME: is the merge case needed if( input.priority == PRIORITY_MERGE_SHARD ) { - wait( delay( 0.5, TaskDataDistribution - 2 ) ); + wait( delay( 0.5, decrementPriority(decrementPriority(TaskPriority::DataDistribution )) ) ); } else { - wait( delay( 0.0001, TaskDataDistributionLaunch ) ); + wait( delay( 0.0001, TaskPriority::DataDistributionLaunch ) ); } loop { @@ -933,7 +933,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd .detail("Count", stuckCount) .detail("TeamCollectionId", tciIndex) .detail("NumOfTeamCollections", self->teamCollections.size()); - wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskDataDistributionLaunch ) ); + wait( delay( SERVER_KNOBS->BEST_TEAM_STUCK_DELAY, TaskPriority::DataDistributionLaunch ) ); } state std::vector destIds; @@ -993,7 +993,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd state Error error = success(); state Promise dataMovementComplete; state Future doMoveKeys = moveKeys(self->cx, rd.keys, destIds, healthyIds, self->lock, dataMovementComplete, &self->startMoveKeysParallelismLock, &self->finishMoveKeysParallelismLock, self->teamCollections.size() > 1, relocateShardInterval.pairID ); - state Future pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch ); + state Future pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch ); try { loop { choose { @@ -1016,7 +1016,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd self->dataTransferComplete.send(rd); } } - pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskDataDistributionLaunch ); + pollHealth = signalledTransferComplete ? Never() : delay( SERVER_KNOBS->HEALTH_POLL_TIME, TaskPriority::DataDistributionLaunch ); } when( wait( signalledTransferComplete ? 
Never() : dataMovementComplete.getFuture() ) ) { self->fetchKeysComplete.insert( rd ); @@ -1066,7 +1066,7 @@ ACTOR Future dataDistributionRelocator( DDQueueData *self, RelocateData rd } else { TEST(true); // move to removed server healthyDestinations.addDataInFlightToTeam( -metrics.bytes ); - wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskDataDistributionLaunch ) ); + wait( delay( SERVER_KNOBS->RETRY_RELOCATESHARD_DELAY, TaskPriority::DataDistributionLaunch ) ); } } } catch (Error& e) { @@ -1125,7 +1125,7 @@ ACTOR Future BgDDMountainChopper( DDQueueData* self, int teamCollectionInd state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; loop { - wait( delay(checkDelay, TaskDataDistributionLaunch) ); + wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) ); if (self->priority_relocations[PRIORITY_REBALANCE_OVERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { state Optional> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( GetTeamRequest( true, false, true ) ) ) ); if( randomTeam.present() ) { @@ -1160,7 +1160,7 @@ ACTOR Future BgDDValleyFiller( DDQueueData* self, int teamCollectionIndex) state double checkDelay = SERVER_KNOBS->BG_DD_POLLING_INTERVAL; state int resetCount = SERVER_KNOBS->DD_REBALANCE_RESET_AMOUNT; loop { - wait( delay(checkDelay, TaskDataDistributionLaunch) ); + wait( delay(checkDelay, TaskPriority::DataDistributionLaunch) ); if (self->priority_relocations[PRIORITY_REBALANCE_UNDERUTILIZED_TEAM] < SERVER_KNOBS->DD_REBALANCE_PARALLELISM) { state Optional> randomTeam = wait( brokenPromiseToNever( self->teamCollections[teamCollectionIndex].getTeam.getReply( GetTeamRequest( true, false, false ) ) ) ); if( randomTeam.present() ) { @@ -1244,7 +1244,7 @@ ACTOR Future dataDistributionQueue( bool wasEmpty = serversToLaunchFrom.empty(); self.queueRelocation( rs, serversToLaunchFrom ); if(wasEmpty && !serversToLaunchFrom.empty()) - launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch); + launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch); } when ( wait(launchQueuedWorkTimeout) ) { self.launchQueuedWork( serversToLaunchFrom ); @@ -1258,7 +1258,7 @@ ACTOR Future dataDistributionQueue( when ( RelocateData done = waitNext( self.dataTransferComplete.getFuture() ) ) { complete( done, self.busymap ); if(serversToLaunchFrom.empty() && !done.src.empty()) - launchQueuedWorkTimeout = delay(0, TaskDataDistributionLaunch); + launchQueuedWorkTimeout = delay(0, TaskPriority::DataDistributionLaunch); serversToLaunchFrom.insert(done.src.begin(), done.src.end()); } when ( RelocateData done = waitNext( self.relocationComplete.getFuture() ) ) { @@ -1266,7 +1266,7 @@ ACTOR Future dataDistributionQueue( self.finishRelocation(done.priority); self.fetchKeysComplete.erase( done ); //self.logRelocation( done, "ShardRelocatorDone" ); - actors.add( tag( delay(0, TaskDataDistributionLaunch), done.keys, rangesComplete ) ); + actors.add( tag( delay(0, TaskPriority::DataDistributionLaunch), done.keys, rangesComplete ) ); if( g_network->isSimulated() && debug_isCheckRelocationDuration() && now() - done.startTime > 60 ) { TraceEvent(SevWarnAlways, "RelocationDurationTooLong").detail("Duration", now() - done.startTime); debug_setCheckRelocationDuration(false); diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index c4c8329754..ca4a849a33 100644 --- 
a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -140,7 +140,7 @@ ACTOR Future trackShardBytes( Reference>> shardSize, bool addToSizeEstimate = true) { - wait( delay( 0, TaskDataDistribution ) ); + wait( delay( 0, TaskPriority::DataDistribution ) ); /*TraceEvent("TrackShardBytesStarting") .detail("TrackerID", trackerID) @@ -260,7 +260,7 @@ ACTOR Future changeSizes( DataDistributionTracker* self, KeyRangeRef keys, } wait( waitForAll( sizes ) ); - wait( yield(TaskDataDistribution) ); + wait( yield(TaskPriority::DataDistribution) ); int64_t newShardsStartingSize = 0; for ( int i = 0; i < sizes.size(); i++ ) @@ -281,7 +281,7 @@ struct HasBeenTrueFor : NonCopyable { Future set() { if( !trigger.isValid() ) { cleared = Promise(); - trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, TaskDataDistribution - 1 ) || cleared.getFuture(); + trigger = delayJittered( SERVER_KNOBS->DD_MERGE_COALESCE_DELAY, decrementPriority(TaskPriority::DataDistribution) ) || cleared.getFuture(); } return trigger; } @@ -361,7 +361,7 @@ ACTOR Future shardSplitter( self->sizeChanges.add( changeSizes( self, keys, shardSize->get().get().bytes ) ); } else { - wait( delay(1.0, TaskDataDistribution) ); //In case the reason the split point was off was due to a discrepancy between storage servers + wait( delay(1.0, TaskPriority::DataDistribution) ); //In case the reason the split point was off was due to a discrepancy between storage servers } return Void(); } @@ -529,7 +529,7 @@ ACTOR Future shardTracker( wait( yieldedFuture(self->maxShardSize->onChange()) ); // Since maxShardSize will become present for all shards at once, avoid slow tasks with a short delay - wait( delay( 0, TaskDataDistribution ) ); + wait( delay( 0, TaskPriority::DataDistribution ) ); /*TraceEvent("ShardTracker", self->distributorId) .detail("Begin", keys.begin) @@ -546,7 +546,7 @@ ACTOR Future shardTracker( // We could have a lot of actors being released from the previous wait at the same time. Immediately calling // delay(0) mitigates the resulting SlowTask - wait( delay(0, TaskDataDistribution) ); + wait( delay(0, TaskPriority::DataDistribution) ); } } catch (Error& e) { if (e.code() != error_code_actor_cancelled) @@ -593,12 +593,12 @@ ACTOR Future trackInitialShards(DataDistributionTracker *self, Referenceshards.size()-1; s++) { restartShardTrackers( self, KeyRangeRef( initData->shards[s].key, initData->shards[s+1].key ) ); - wait( yield( TaskDataDistribution ) ); + wait( yield( TaskPriority::DataDistribution ) ); } Future initialSize = changeSizes( self, KeyRangeRef(allKeys.begin, allKeys.end), 0 ); diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 4a9bee5c98..6797d87a77 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -69,6 +69,7 @@ // // Retrieves the previously stored boolean // bool getPrefixSource() const; // +#pragma pack(push,1) template struct DeltaTree { @@ -76,36 +77,47 @@ struct DeltaTree { return std::numeric_limits::max(); }; -#pragma pack(push,1) struct Node { OffsetT leftChildOffset; OffsetT rightChildOffset; - DeltaT delta[0]; + + inline DeltaT & delta() { + return *(DeltaT *)(this + 1); + }; + + inline const DeltaT & delta() const { + return *(const DeltaT *)(this + 1); + }; Node * rightChild() const { - //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta->size()); - return rightChildOffset == 0 ? 
nullptr : (Node *)((uint8_t *)delta + rightChildOffset); + //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta().size()); + return rightChildOffset == 0 ? nullptr : (Node *)((uint8_t *)&delta() + rightChildOffset); } Node * leftChild() const { - //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta->size()); - return leftChildOffset == 0 ? nullptr : (Node *)((uint8_t *)delta + leftChildOffset); + //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta().size()); + return leftChildOffset == 0 ? nullptr : (Node *)((uint8_t *)&delta() + leftChildOffset); } int size() const { - return sizeof(Node) + delta->size(); + return sizeof(Node) + delta().size(); } }; -#pragma pack(pop) -#pragma pack(push,1) struct { OffsetT nodeBytes; // Total size of all Nodes including the root uint8_t initialDepth; // Levels in the tree as of the last rebuild - Node root[0]; }; #pragma pack(pop) + inline Node & root() { + return *(Node *)(this + 1); + } + + inline const Node & root() const { + return *(const Node *)(this + 1); + } + int size() const { return sizeof(DeltaTree) + nodeBytes; } @@ -119,18 +131,18 @@ public: struct DecodedNode { DecodedNode(Node *raw, const T *prev, const T *next, Arena &arena) : raw(raw), parent(nullptr), left(nullptr), right(nullptr), prev(prev), next(next), - item(raw->delta->apply(raw->delta->getPrefixSource() ? *prev : *next, arena)) + item(raw->delta().apply(raw->delta().getPrefixSource() ? *prev : *next, arena)) { - //printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta->toString().c_str()); + //printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta().toString().c_str()); } DecodedNode(Node *raw, DecodedNode *parent, bool left, Arena &arena) : parent(parent), raw(raw), left(nullptr), right(nullptr), prev(left ? parent->prev : &parent->item), next(left ? &parent->item : parent->next), - item(raw->delta->apply(raw->delta->getPrefixSource() ? *prev : *next, arena)) + item(raw->delta().apply(raw->delta().getPrefixSource() ? *prev : *next, arena)) { - //printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta->toString().c_str()); + //printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta().toString().c_str()); } Node *raw; @@ -175,7 +187,7 @@ public: lower = new(arena) T(arena, *lower); upper = new(arena) T(arena, *upper); - root = (tree->nodeBytes == 0) ? nullptr : new (arena) DecodedNode(tree->root, lower, upper, arena); + root = (tree->nodeBytes == 0) ? 
nullptr : new (arena) DecodedNode(&tree->root(), lower, upper, arena); } const T *lowerBound() const { @@ -330,7 +342,7 @@ public: // The boundary leading to the new page acts as the last time we branched right if(begin != end) { - nodeBytes = build(*root, begin, end, prev, next); + nodeBytes = build(root(), begin, end, prev, next); } else { nodeBytes = 0; @@ -341,7 +353,7 @@ public: private: static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next) { //printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); - //printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), root.delta); + //printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), &root.delta()); ASSERT(end != begin); int count = end - begin; @@ -370,12 +382,12 @@ private: base = next; } - int deltaSize = item.writeDelta(*root.delta, *base, commonPrefix); - root.delta->setPrefixSource(prefixSourcePrev); - //printf("Serialized %s to %p\n", item.toString().c_str(), root.delta); + int deltaSize = item.writeDelta(root.delta(), *base, commonPrefix); + root.delta().setPrefixSource(prefixSourcePrev); + //printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta()); // Continue writing after the serialized Delta. - uint8_t *wptr = (uint8_t *)root.delta + deltaSize; + uint8_t *wptr = (uint8_t *)&root.delta() + deltaSize; // Serialize left child if(count > 1) { @@ -388,7 +400,7 @@ private: // Serialize right child if(count > 2) { - root.rightChildOffset = wptr - (uint8_t *)root.delta; + root.rightChildOffset = wptr - (uint8_t *)&root.delta(); wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next); } else { diff --git a/fdbserver/KeyValueStoreMemory.actor.cpp b/fdbserver/KeyValueStoreMemory.actor.cpp index 16eebb8a7a..796844f4cd 100644 --- a/fdbserver/KeyValueStoreMemory.actor.cpp +++ b/fdbserver/KeyValueStoreMemory.actor.cpp @@ -715,7 +715,7 @@ KeyValueStoreMemory::KeyValueStoreMemory( IDiskQueue* log, UID id, int64_t memor IKeyValueStore* keyValueStoreMemory( std::string const& basename, UID logID, int64_t memoryLimit, std::string ext ) { TraceEvent("KVSMemOpening", logID).detail("Basename", basename).detail("MemoryLimit", memoryLimit); - IDiskQueue *log = openDiskQueue( basename, ext, logID, DiskQueueVersion::V0 ); + IDiskQueue *log = openDiskQueue( basename, ext, logID, DiskQueueVersion::V1 ); return new KeyValueStoreMemory( log, logID, memoryLimit, false, false, false ); } diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index e53fa5a29a..7ce1a5c9b0 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -1937,8 +1937,8 @@ KeyValueStoreSQLite::KeyValueStoreSQLite(std::string const& filename, UID id, Ke readCursors.resize(64); //< number of read threads sqlite3_soft_heap_limit64( SERVER_KNOBS->SOFT_HEAP_LIMIT ); // SOMEDAY: Is this a performance issue? Should we drop the cache sizes for individual threads? 
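Review note on the DeltaTree.h hunks: the zero-length trailing members (DeltaT delta[0], Node root[0]) are replaced by accessors that locate the trailing payload at "this + 1", which only lines up because the whole struct now sits inside #pragma pack(push,1). A stripped-down, compilable sketch of the same trick; PackedNode and Payload are illustrative names, not the real DeltaTree types.

#include <cstdint>
#include <cstdio>

#pragma pack(push, 1)
struct Payload {
	uint32_t size;   // stands in for DeltaT's variable-length encoding
};

struct PackedNode {
	uint16_t leftChildOffset;
	uint16_t rightChildOffset;

	// The payload starts immediately after the fixed header, so its address is
	// "one PackedNode past this" - the same idea as Node::delta() returning
	// *(DeltaT *)(this + 1).
	Payload&       payload()       { return *reinterpret_cast<Payload*>(this + 1); }
	const Payload& payload() const { return *reinterpret_cast<const Payload*>(this + 1); }
};
#pragma pack(pop)

int main() {
	alignas(8) unsigned char buf[sizeof(PackedNode) + sizeof(Payload)] = {};
	PackedNode* n = reinterpret_cast<PackedNode*>(buf);
	n->payload().size = 42;
	std::printf("payload size = %u\n", n->payload().size);
	return 0;
}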
- int taskId = g_network->getCurrentTask(); - g_network->setCurrentTask(TaskDiskWrite); + TaskPriority taskId = g_network->getCurrentTask(); + g_network->setCurrentTask(TaskPriority::DiskWrite); writeThread->addThread( new Writer(filename, type==KeyValueStoreType::SSD_BTREE_V2, checkChecksums, checkIntegrity, writesComplete, springCleaningStats, diskBytesUsed, freeListPages, id, &readCursors) ); g_network->setCurrentTask(taskId); auto p = new Writer::InitAction(); @@ -1963,8 +1963,8 @@ StorageBytes KeyValueStoreSQLite::getStorageBytes() { void KeyValueStoreSQLite::startReadThreads() { int nReadThreads = readCursors.size(); - int taskId = g_network->getCurrentTask(); - g_network->setCurrentTask(TaskDiskRead); + TaskPriority taskId = g_network->getCurrentTask(); + g_network->setCurrentTask(TaskPriority::DiskRead); for(int i=0; iaddThread( new Reader(filename, type==KeyValueStoreType::SSD_BTREE_V2, readsComplete, logID, &readCursors[i]) ); g_network->setCurrentTask(taskId); diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 74f095d18a..c87a8bd074 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -413,6 +413,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( BUGGIFY_BLOCK_BYTES, 10000 ); init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000; init( STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, 0.25 ); + init( STORAGE_DURABILITY_LAG_MIN_RATE, 0.1 ); init( STORAGE_COMMIT_INTERVAL, 0.5 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_INTERVAL = 2.0; init( UPDATE_SHARD_VERSION_INTERVAL, 0.25 ); if( randomize && BUGGIFY ) UPDATE_SHARD_VERSION_INTERVAL = 1.0; init( BYTE_SAMPLING_FACTOR, 250 ); //cannot buggify because of differences in restarting tests diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index e61a3a1c55..9101c7de72 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -352,6 +352,7 @@ public: int64_t STORAGE_DURABILITY_LAG_HARD_MAX; int64_t STORAGE_DURABILITY_LAG_SOFT_MAX; double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD; + double STORAGE_DURABILITY_LAG_MIN_RATE; int STORAGE_COMMIT_BYTES; double STORAGE_COMMIT_INTERVAL; double UPDATE_SHARD_VERSION_INTERVAL; diff --git a/fdbserver/LeaderElection.actor.cpp b/fdbserver/LeaderElection.actor.cpp index 3cc50609d3..5a97b6358f 100644 --- a/fdbserver/LeaderElection.actor.cpp +++ b/fdbserver/LeaderElection.actor.cpp @@ -30,7 +30,7 @@ Optional> getLeader( const vector submitCandidacy( Key key, LeaderElectionRegInterface coord, LeaderInfo myInfo, UID prevChangeID, Reference>>> nominees, int index ) { loop { auto const& nom = nominees->get()[index]; - Optional li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID ), TaskCoordinationReply ) ); + Optional li = wait( retryBrokenPromise( coord.candidacy, CandidacyRequest( key, myInfo, nom.present() ? nom.get().changeID : UID(), prevChangeID ), TaskPriority::CoordinationReply ) ); if (li != nominees->get()[index]) { vector> v = nominees->get(); @@ -150,7 +150,7 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu // we might be breaking the leader election process for someone with better communications but lower ID, so change IDs. 
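Review note on the KeyValueStoreSQLite hunks: the saved and restored "current task" now has type TaskPriority rather than int, and the get/set/do-work/restore pattern around addThread() appears in both the constructor and startReadThreads(). A small RAII guard would make the restore impossible to forget; the guard and the FakeNetwork stub below are hypothetical, only the getCurrentTask/setCurrentTask names come from the diff.

enum class TaskPriority { DefaultYield, DiskRead, DiskWrite };

// Minimal stand-in for the slice of INetwork used here; the real calls are
// g_network->getCurrentTask() and g_network->setCurrentTask(...).
struct FakeNetwork {
	TaskPriority current = TaskPriority::DefaultYield;
	TaskPriority getCurrentTask() const { return current; }
	void setCurrentTask(TaskPriority p) { current = p; }
};

// Hypothetical RAII guard: set a priority for the enclosing scope, restore on exit.
class CurrentTaskGuard {
	FakeNetwork& net;
	TaskPriority saved;
public:
	CurrentTaskGuard(FakeNetwork& n, TaskPriority p) : net(n), saved(n.getCurrentTask()) {
		net.setCurrentTask(p);
	}
	~CurrentTaskGuard() { net.setCurrentTask(saved); }
};

void startWriteThread(FakeNetwork& net) {
	CurrentTaskGuard guard(net, TaskPriority::DiskWrite);
	// ... writeThread->addThread(new Writer(...)) would run at DiskWrite here ...
}   // previous priority restored automatically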
if ((!leader.present() || !leader.get().second) && std::count( nominees->get().begin(), nominees->get().end(), myInfo )) { if (!badCandidateTimeout.isValid()) - badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskCoordinationReply ); + badCandidateTimeout = delay( SERVER_KNOBS->POLLING_FREQUENCY*2, TaskPriority::CoordinationReply ); } else badCandidateTimeout = Future(); @@ -183,12 +183,12 @@ ACTOR Future tryBecomeLeaderInternal(ServerCoordinators coordinators, Valu state vector> true_heartbeats; state vector> false_heartbeats; for(int i=0; i hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskCoordinationReply ); + Future hb = retryBrokenPromise( coordinators.leaderElectionServers[i].leaderHeartbeat, LeaderHeartbeatRequest( coordinators.clusterKey, myInfo, prevChangeID ), TaskPriority::CoordinationReply ); true_heartbeats.push_back( onEqual(hb, true) ); false_heartbeats.push_back( onEqual(hb, false) ); } - state Future rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskCoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side? + state Future rate = delay( SERVER_KNOBS->HEARTBEAT_FREQUENCY, TaskPriority::CoordinationReply ) || asyncPriorityInfo->onChange(); // SOMEDAY: Move to server side? choose { when ( wait( quorum( true_heartbeats, true_heartbeats.size()/2+1 ) ) ) { diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 5ab13ee1a8..bbabe5a71c 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -51,7 +51,7 @@ struct LogRouterData { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, LogRouterData *tlogData, TaskPriority taskID ) { while(!self->version_messages.empty() && self->version_messages.front().first < before) { Version version = self->version_messages.front().first; int64_t messagesErased = 0; @@ -68,7 +68,7 @@ struct LogRouterData { return Void(); } - Future eraseMessagesBefore(Version before, LogRouterData *tlogData, int taskID) { + Future eraseMessagesBefore(Version before, LogRouterData *tlogData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, taskID); } }; @@ -197,7 +197,7 @@ ACTOR Future waitForVersion( LogRouterData *self, Version ver ) { while(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS < ver) { if(self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS > self->version.get()) { self->version.set( self->minPopped.get() + SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); } else { wait(self->minPopped.whenAtLeast((self->minPopped.get()+1))); } @@ -220,7 +220,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { loop { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { //FIXME: does this actually happen? 
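Review note: switching the taskID parameters in LogRouter (and in the TLog and peek-cursor code further down) from int to TaskPriority also buys type safety, since a scoped enum does not convert implicitly, so any call site still passing a bare integer stops compiling. A tiny illustration with a stubbed signature; only the parameter-type change itself is taken from the diff.

enum class TaskPriority { TLogPop, UpdateStorage };
using Version = long long;

// New-style helper signature: the priority is strongly typed.
void eraseMessagesBefore(Version before, TaskPriority taskID) {
	(void)before; (void)taskID;
	// ... drop queued messages with versions < before, yielding at taskID ...
}

int main() {
	eraseMessagesBefore(100, TaskPriority::TLogPop);   // OK
	// eraseMessagesBefore(100, 7);                    // error: no implicit int -> TaskPriority conversion
	return 0;
}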
@@ -247,7 +247,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { commitMessages(self, ver, messages); self->version.set( ver ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); //TraceEvent("LogRouterVersion").detail("Ver",ver); } lastVer = ver; @@ -260,7 +260,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { wait( waitForVersion(self, ver) ); self->version.set( ver ); - wait(yield(TaskTLogCommit)); + wait(yield(TaskPriority::TLogCommit)); } break; } @@ -371,7 +371,7 @@ ACTOR Future logRouterPop( LogRouterData* self, TLogPopRequest req ) { } else if (req.to > tagData->popped) { tagData->popped = req.to; tagData->durableKnownCommittedVersion = req.durableKnownCommittedVersion; - wait(tagData->eraseMessagesBefore( req.to, self, TaskTLogPop )); + wait(tagData->eraseMessagesBefore( req.to, self, TaskPriority::TLogPop )); } state Version minPopped = std::numeric_limits::max(); @@ -385,7 +385,7 @@ ACTOR Future logRouterPop( LogRouterData* self, TLogPopRequest req ) { while(!self->messageBlocks.empty() && self->messageBlocks.front().first < minPopped) { self->messageBlocks.pop_front(); - wait(yield(TaskTLogPop)); + wait(yield(TaskPriority::TLogPop)); } self->poppedVersion = std::min(minKnownCommittedVersion, self->minKnownCommittedVersion); diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index 559be4192b..2540d52a8b 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -341,7 +341,7 @@ struct ILogSystem { //returns immediately if hasMessage() returns true. //returns when either the result of hasMessage() or version() has changed, or a cursor has internally been exhausted. - virtual Future getMore(int taskID = TaskTLogPeekReply) = 0; + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) = 0; //returns when the failure monitor detects that the servers associated with the cursor are failed virtual Future onFailed() = 0; @@ -407,7 +407,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -455,7 +455,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -500,7 +500,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -534,7 +534,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); @@ -594,7 +594,7 @@ struct ILogSystem { virtual StringRef getMessageWithTags(); virtual const std::vector& getTags(); 
virtual void advanceTo(LogMessageVersion n); - virtual Future getMore(int taskID = TaskTLogPeekReply); + virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); virtual bool isActive(); virtual bool isExhausted(); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index f32c3aa5cd..797aa85a13 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -133,7 +133,7 @@ void ILogSystem::ServerPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) { +ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) { if( !self->interf || self->messageVersion >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -198,7 +198,7 @@ ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self } } -ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int taskID ) { +ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, TaskPriority taskID ) { if( !self->interf || self->messageVersion >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -234,7 +234,7 @@ ACTOR Future serverPeekGetMore( ILogSystem::ServerPeekCursor* self, int ta } } -Future ILogSystem::ServerPeekCursor::getMore(int taskID) { +Future ILogSystem::ServerPeekCursor::getMore(TaskPriority taskID) { //TraceEvent("SPC_GetMore", randomID).detail("HasMessage", hasMessage()).detail("More", !more.isValid() || more.isReady()).detail("MessageVersion", messageVersion.toString()).detail("End", end.toString()); if( hasMessage() ) return Void(); @@ -444,7 +444,7 @@ void ILogSystem::MergedPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, int taskID) { +ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) { loop { //TraceEvent("MPC_GetMoreA", self->randomID).detail("Start", startVersion.toString()); if(self->bestServer >= 0 && self->serverCursors[self->bestServer]->isActive()) { @@ -465,7 +465,7 @@ ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess } } -Future ILogSystem::MergedPeekCursor::getMore(int taskID) { +Future ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) { if(!serverCursors.size()) return Never(); @@ -705,7 +705,7 @@ void ILogSystem::SetPeekCursor::advanceTo(LogMessageVersion n) { } } -ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, int taskID) { +ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVersion startVersion, TaskPriority taskID) { loop { //TraceEvent("LPC_GetMore1", self->randomID).detail("Start", startVersion.toString()).detail("Tag", self->tag); if(self->bestServer >= 0 && self->bestSet >= 0 && self->serverCursors[self->bestSet][self->bestServer]->isActive()) { @@ -766,7 +766,7 @@ ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer } } -Future ILogSystem::SetPeekCursor::getMore(int taskID) { +Future ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) { auto startVersion = version(); calcHasMessage(); if( hasMessage() ) @@ -861,7 +861,7 @@ void ILogSystem::MultiCursor::advanceTo(LogMessageVersion n) { cursors.back()->advanceTo(n); } -Future ILogSystem::MultiCursor::getMore(int taskID) { +Future 
ILogSystem::MultiCursor::getMore(TaskPriority taskID) { LogMessageVersion startVersion = cursors.back()->version(); while( cursors.size() > 1 && cursors.back()->version() >= epochEnds.back() ) { poppedVersion = std::max(poppedVersion, cursors.back()->popped()); @@ -977,7 +977,7 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) { ASSERT(false); } -ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, int taskID ) { +ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, TaskPriority taskID ) { loop { wait(yield()); if(cursor->version().version >= maxVersion) { @@ -994,7 +994,7 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe } } -ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID ) { +ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriority taskID ) { if( self->messageVersion.version >= self->end ) { wait( Future(Never())); throw internal_error(); @@ -1028,7 +1028,7 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, int taskID return Void(); } -Future ILogSystem::BufferedCursor::getMore(int taskID) { +Future ILogSystem::BufferedCursor::getMore(TaskPriority taskID) { if( hasMessage() ) return Void(); return bufferedGetMore(this, taskID); diff --git a/fdbserver/MasterInterface.h b/fdbserver/MasterInterface.h index 44674ec3bb..91a0d2444d 100644 --- a/fdbserver/MasterInterface.h +++ b/fdbserver/MasterInterface.h @@ -50,7 +50,7 @@ struct MasterInterface { } void initEndpoints() { - getCommitVersion.getEndpoint( TaskProxyGetConsistentReadVersion ); + getCommitVersion.getEndpoint( TaskPriority::ProxyGetConsistentReadVersion ); } }; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 3fc4665a15..8051ddb662 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -95,11 +95,11 @@ ACTOR Future getRate(UID myID, Reference> db, int64 loop choose { when ( wait( db->onChange() ) ) { if ( db->get().ratekeeper.present() ) { - TraceEvent("Proxy_RatekeeperChanged", myID) + TraceEvent("ProxyRatekeeperChanged", myID) .detail("RKID", db->get().ratekeeper.get().id()); nextRequestTimer = Void(); // trigger GetRate request } else { - TraceEvent("Proxy_RatekeeperDied", myID); + TraceEvent("ProxyRatekeeperDied", myID); nextRequestTimer = Never(); reply = Never(); } @@ -158,7 +158,7 @@ ACTOR Future queueTransactionStartRequests( if (now() - *lastGRVTime > *GRVBatchTime) *lastGRVTime = now() - *GRVBatchTime; - forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskProxyGRVTimer)); + forwardPromise(GRVTimer, delayJittered(*GRVBatchTime - (now() - *lastGRVTime), TaskPriority::ProxyGRVTimer)); } transactionQueue->push(std::make_pair(req, counter--)); @@ -263,7 +263,7 @@ struct ProxyCommitData { lastVersionTime(0), commitVersionRequestNumber(1), mostRecentProcessedRequestNumber(0), getConsistentReadVersion(getConsistentReadVersion), commit(commit), lastCoalesceTime(0), localCommitBatchesStarted(0), locked(false), commitBatchInterval(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_MIN), - firstProxy(firstProxy), cx(openDBOnServer(db, TaskDefaultEndpoint, true, true)), db(db), + firstProxy(firstProxy), cx(openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true)), db(db), singleKeyMutationEvent(LiteralStringRef("SingleKeyMutation")), commitBatchesMemBytesCount(0), lastTxsPop(0) {} }; @@ -350,7 
+350,7 @@ struct ResolutionRequestBuilder { }; ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStream, int> > out, FutureStream in, int desiredBytes, int64_t memBytesLimit) { - wait(delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher)); + wait(delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher)); state double lastBatch = 0; @@ -363,7 +363,7 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamMAX_COMMIT_BATCH_INTERVAL, TaskProxyCommitBatcher); + timeout = delayJittered(SERVER_KNOBS->MAX_COMMIT_BATCH_INTERVAL, TaskPriority::ProxyCommitBatcher); } while(!timeout.isReady() && !(batch.size() == SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_COUNT_MAX || batchBytes >= desiredBytes)) { @@ -387,10 +387,10 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamcommitBatchStartNotifications.send(Void()); if(now() - lastBatch > commitData->commitBatchInterval) { - timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskProxyCommitBatcher); + timeout = delayJittered(SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_FROM_IDLE, TaskPriority::ProxyCommitBatcher); } else { - timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskProxyCommitBatcher); + timeout = delayJittered(commitData->commitBatchInterval - (now() - lastBatch), TaskPriority::ProxyCommitBatcher); } } @@ -398,7 +398,7 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStreamcommitBatchStartNotifications.send(Void()); - timeout = delayJittered(commitData->commitBatchInterval, TaskProxyCommitBatcher); + timeout = delayJittered(commitData->commitBatchInterval, TaskPriority::ProxyCommitBatcher); batch = std::vector(); batchBytes = 0; } @@ -457,7 +457,7 @@ ACTOR Future commitBatch( ASSERT(SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS <= SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT); // since we are using just the former to limit the number of versions actually in flight! 
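Review note on the commitBatcher hunks: the rename leaves the batching timer logic untouched - after a request arrives, the next flush is scheduled either at the short "from idle" interval (when the previous batch is old) or at whatever remains of the normal window, always via delayJittered at TaskPriority::ProxyCommitBatcher. A sketch of just the timeout arithmetic; the constants below are made-up stand-ins for the SERVER_KNOBS values.

#include <algorithm>
#include <cstdio>

// Illustrative stand-ins for SERVER_KNOBS->COMMIT_TRANSACTION_BATCH_INTERVAL_*.
const double BATCH_INTERVAL           = 0.005;   // normal batch window, seconds
const double BATCH_INTERVAL_FROM_IDLE = 0.001;   // first request after an idle stretch

// How long to wait before flushing the batch this request just joined.
double nextBatchTimeout(double now, double lastBatch) {
	if (now - lastBatch > BATCH_INTERVAL)
		return BATCH_INTERVAL_FROM_IDLE;                       // proxy was idle: flush quickly
	return std::max(0.0, BATCH_INTERVAL - (now - lastBatch));  // otherwise finish the current window
}

int main() {
	std::printf("idle case: %.4f s\n", nextBatchTimeout(10.0, 1.0));
	std::printf("busy case: %.4f s\n", nextBatchTimeout(10.0, 9.998));
	return 0;
}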
// Active load balancing runs at a very high priority (to obtain accurate estimate of memory used by commit batches) so we need to downgrade here - wait(delay(0, TaskProxyCommit)); + wait(delay(0, TaskPriority::ProxyCommit)); self->lastVersionTime = t1; @@ -534,7 +534,7 @@ ACTOR Future commitBatch( vector< Future > replies; for (int r = 0; rresolvers.size(); r++) { requests.requests[r].debugID = debugID; - replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskProxyResolverReply))); + replies.push_back(brokenPromiseToNever(self->resolvers[r].resolve.getReply(requests.requests[r], TaskPriority::ProxyResolverReply))); } state vector> transactionResolverMap = std::move( requests.transactionResolverMap ); @@ -1135,7 +1135,7 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi state vector> proxyVersions; for (auto const& p : *otherProxies) - proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskTLogConfirmRunningReply))); + proxyVersions.push_back(brokenPromiseToNever(p.getRawCommittedVersion.getReply(GetRawCommittedVersionRequest(debugID), TaskPriority::TLogConfirmRunningReply))); if (!(flags&GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY)) { @@ -1292,7 +1292,7 @@ ACTOR static Future transactionStarter( } if (!transactionQueue.empty()) - forwardPromise(GRVTimer, delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskProxyGRVTimer)); + forwardPromise(GRVTimer, delayJittered(SERVER_KNOBS->START_TRANSACTION_BATCH_QUEUE_CHECK_INTERVAL, TaskPriority::ProxyGRVTimer)); /*TraceEvent("GRVBatch", proxy.id()) .detail("Elapsed", elapsed) diff --git a/fdbserver/MoveKeys.actor.cpp b/fdbserver/MoveKeys.actor.cpp index 4893f3c6a1..6a979e3cc5 100644 --- a/fdbserver/MoveKeys.actor.cpp +++ b/fdbserver/MoveKeys.actor.cpp @@ -130,12 +130,12 @@ ACTOR Future> addReadWriteDestinations(KeyRangeRef shard, vector> > srcChecks; for(int s=0; sSERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), srcInterfs[s].id(), 0 ) ); + srcChecks.push_back( checkReadWrite( srcInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), srcInterfs[s].id(), 0 ) ); } state vector< Future> > destChecks; for(int s=0; sSERVER_READY_QUORUM_INTERVAL, 0, TaskMoveKeys ), destInterfs[s].id(), version ) ); + destChecks.push_back( checkReadWrite( destInterfs[s].getShardState.getReplyUnlessFailedFor( GetShardStateRequest( shard, GetShardStateRequest::NO_WAIT), SERVER_KNOBS->SERVER_READY_QUORUM_INTERVAL, 0, TaskPriority::MoveKeys ), destInterfs[s].id(), version ) ); } wait( waitForAll(srcChecks) && waitForAll(destChecks) ); @@ -225,7 +225,7 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve state TraceInterval interval("RelocateShard_StartMoveKeys"); //state TraceInterval waitInterval(""); - wait( startMoveKeysLock->take( TaskDataDistributionLaunch ) ); + wait( startMoveKeysLock->take( TaskPriority::DataDistributionLaunch ) ); state FlowLock::Releaser releaser( *startMoveKeysLock ); TraceEvent(SevDebug, interval.begin(), relocationIntervalId); @@ -255,7 +255,7 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve //Keep track of shards for all src servers so that we can preserve their values in serverKeys state Map> shardMap; - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; 
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); wait( checkMoveKeysLock(&tr, lock) ); @@ -394,11 +394,11 @@ ACTOR Future startMoveKeys( Database occ, KeyRange keys, vector serve ACTOR Future waitForShardReady( StorageServerInterface server, KeyRange keys, Version minVersion, GetShardStateRequest::waitMode mode ) { loop { try { - std::pair rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskMoveKeys ) ); + std::pair rep = wait( server.getShardState.getReply( GetShardStateRequest(keys, mode), TaskPriority::MoveKeys ) ); if (rep.first >= minVersion) { return Void(); } - wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskMoveKeys ) ); + wait( delayJittered( SERVER_KNOBS->SHARD_READY_DELAY, TaskPriority::MoveKeys ) ); } catch (Error& e) { if( e.code() != error_code_timed_out ) { @@ -419,7 +419,7 @@ ACTOR Future checkFetchingState( Database cx, vector dest, KeyRange k try { if (BUGGIFY) wait(delay(5)); - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); vector< Future< Optional > > serverListEntries; @@ -439,7 +439,7 @@ ACTOR Future checkFetchingState( Database cx, vector dest, KeyRange k } wait( timeoutError( waitForAll( requests ), - SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskMoveKeys ) ); + SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, TaskPriority::MoveKeys ) ); dataMovementComplete.send(Void()); return Void(); @@ -480,11 +480,11 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest //printf("finishMoveKeys( '%s'-'%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); loop { try { - tr.info.taskID = TaskMoveKeys; + tr.info.taskID = TaskPriority::MoveKeys; tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); releaser.release(); - wait( finishMoveKeysParallelismLock->take( TaskDataDistributionLaunch ) ); + wait( finishMoveKeysParallelismLock->take( TaskPriority::DataDistributionLaunch ) ); releaser = FlowLock::Releaser( *finishMoveKeysParallelismLock ); wait( checkMoveKeysLock(&tr, lock) ); @@ -632,7 +632,7 @@ ACTOR Future finishMoveKeys( Database occ, KeyRange keys, vector dest for(int s=0; sSERVER_READY_QUORUM_TIMEOUT, Void(), TaskMoveKeys ) ); + wait( timeout( waitForAll( serverReady ), SERVER_KNOBS->SERVER_READY_QUORUM_TIMEOUT, Void(), TaskPriority::MoveKeys ) ); int count = dest.size() - newDestinations.size(); for(int s=0; s removeStorageServer( Database cx, UID serverID, MoveKeysLock if (!canRemove) { TEST(true); // The caller had a transaction in flight that assigned keys to the server. Wait for it to reverse its mistake. 
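Review note on the MoveKeys hunks: startMoveKeys and finishMoveKeys keep the same concurrency discipline - take a FlowLock permit at TaskPriority::DataDistributionLaunch, hold it through the retry loop with an RAII Releaser, and release/re-take when the transaction restarts. A synchronous, self-contained stand-in for that shape; PermitLock and Releaser below are illustrative stubs, not flow's FlowLock.

#include <cassert>

// Synchronous stand-in for a counting FlowLock.
struct PermitLock {
	int available = 1;
	void take()    { assert(available > 0); --available; }   // real code: wait( lock->take(TaskPriority::DataDistributionLaunch) )
	void release() { ++available; }
};

// RAII releaser, mirroring FlowLock::Releaser.
struct Releaser {
	PermitLock* lock = nullptr;
	Releaser() = default;
	explicit Releaser(PermitLock& l) : lock(&l) {}
	Releaser(Releaser&& r) noexcept : lock(r.lock) { r.lock = nullptr; }
	Releaser& operator=(Releaser&& r) noexcept { release(); lock = r.lock; r.lock = nullptr; return *this; }
	void release() { if (lock) { lock->release(); lock = nullptr; } }
	~Releaser() { release(); }
};

void finishMoveLikeLoop(PermitLock& parallelismLock) {
	Releaser releaser;
	for (int attempt = 0; attempt < 3; ++attempt) {
		releaser.release();                  // give the permit back before blocking again
		parallelismLock.take();              // re-acquire at the data-distribution-launch priority
		releaser = Releaser(parallelismLock);
		// ... transaction body; a retriable error loops back around ...
	}
}   // permit returned when the releaser goes out of scope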
TraceEvent(SevWarn,"NoCanRemove").detail("Count", noCanRemoveCount++).detail("ServerID", serverID); - wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskDataDistributionLaunch) ); + wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::DataDistributionLaunch) ); tr.reset(); TraceEvent("RemoveStorageServerRetrying").detail("CanRemove", canRemove); } else { diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index fd2be1f08f..bd8db636a1 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -333,7 +333,7 @@ namespace oldTLog_4_6 { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference tlogData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, int64_t* gBytesErased, Reference tlogData, TaskPriority taskID ) { while(!self->version_messages.empty() && self->version_messages.front().first < before) { Version version = self->version_messages.front().first; std::pair &sizes = tlogData->version_sizes[version]; @@ -359,7 +359,7 @@ namespace oldTLog_4_6 { return Void(); } - Future eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference tlogData, int taskID) { + Future eraseMessagesBefore(Version before, int64_t* gBytesErased, Reference tlogData, TaskPriority taskID) { return eraseMessagesBefore(this, before, gBytesErased, tlogData, taskID); } }; @@ -526,21 +526,21 @@ namespace oldTLog_4_6 { self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tag->key, currentVersion ), wr.toValue() ) ); - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tag->value.version_messages.begin(), tag->value.version_messages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); } } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } self->persistentData->set( KeyValueRef( BinaryWriter::toValue(logData->logId,Unversioned()).withPrefix(persistCurrentVersionKeys.begin), BinaryWriter::toValue(newPersistentDataVersion, Unversioned()) ) ); logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
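Review note on the updatePersistentData hunks (here, and repeated in the 6.0 TLog further down): the detail worth keeping in mind through the rename is that the spill loop only recomputes its upper_bound iterator when yield(TaskPriority::UpdateStorage) actually suspends, because a real wait lets new messages arrive and invalidate the deque iterator. A rough synchronous analogue of that "revalidate only if we really yielded" shape; the container and the maybeYield stub below are illustrative, not flow code.

#include <algorithm>
#include <deque>
#include <utility>

using Version = long long;
using VersionedMessage = std::pair<Version, int>;   // stand-in for (version, LengthPrefixedStringRef)

// Stand-in for yield(TaskPriority::UpdateStorage): returns true if control was
// actually given up (in flow, that is the "!f.isReady()" path followed by wait(f)).
bool maybeYield() { return false; }

void spillUpTo(std::deque<VersionedMessage>& messages, Version newPersistentDataVersion) {
	auto msg = messages.begin();
	while (msg != messages.end() && msg->first <= newPersistentDataVersion) {
		Version currentVersion = msg->first;
		// ... serialize the messages for currentVersion and write them to persistentData ...
		++msg;
		if (maybeYield()) {
			// We really suspended: other actors may have appended to `messages`,
			// so recompute the iterator instead of reusing a possibly invalid one.
			msg = std::upper_bound(messages.begin(), messages.end(), currentVersion,
			                       [](Version v, const VersionedMessage& m) { return v < m.first; });
		}
	}
}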
@@ -548,20 +548,20 @@ namespace oldTLog_4_6 { logData->persistentDataDurableVersion = newPersistentDataVersion; for(tag = logData->tag_data.begin(); tag != logData->tag_data.end(); ++tag) { - wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(tag->value.eraseMessagesBefore( newPersistentDataVersion+1, &self->bytesDurable, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -586,7 +586,7 @@ namespace oldTLog_4_6 { } if(!self->queueOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -621,14 +621,14 @@ namespace oldTLog_4_6 { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { self->updatePersist = updatePersistentData(self, logData, nextVersion); wait( self->updatePersist ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -639,9 +639,9 @@ namespace oldTLog_4_6 { if(logData->persistentDataDurableVersion == logData->version.get()) { self->queueOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -650,7 +650,7 @@ namespace oldTLog_4_6 { while( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT && sizeItr != logData->version_sizes.end() && (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD || sizeItr->value.first == 0) ) { - wait( yield(TaskUpdateStorage) ); + wait( yield(TaskPriority::UpdateStorage) ); ++sizeItr; nextVersion = sizeItr == logData->version_sizes.end() ? logData->version.get() : sizeItr->key; @@ -662,7 +662,7 @@ namespace oldTLog_4_6 { totalSize += it->second.expectedSize(); } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } prevVersion = nextVersion; @@ -673,7 +673,7 @@ namespace oldTLog_4_6 { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { self->updatePersist = updatePersistentData(self, logData, nextVersion); @@ -681,21 +681,21 @@ namespace oldTLog_4_6 { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -823,7 +823,7 @@ namespace oldTLog_4_6 { ti->value.popped_recently = true; //if (to.epoch == self->epoch()) if ( req.to > logData->persistentDataDurableVersion ) - wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskTLogPop )); + wait(ti->value.eraseMessagesBefore( req.to, &self->bytesDurable, logData, TaskPriority::TLogPop )); } req.reply.send(Void()); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 227578d49f..68df56b92f 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -297,7 +297,7 @@ struct TLogData : NonCopyable { concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() { - cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); } }; @@ -323,7 +323,7 @@ struct LogData : NonCopyable, public ReferenceCounted { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, TaskPriority taskID ) { while(!self->versionMessages.empty() && self->versionMessages.front().first < before) { Version version = self->versionMessages.front().first; std::pair &sizes = logData->version_sizes[version]; @@ -352,7 +352,7 @@ struct LogData : NonCopyable, public ReferenceCounted { return Void(); } - Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, int taskID) { + Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, logData, taskID); } }; @@ -607,14 +607,14 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD self->persistentData->set( KeyValueRef( persistTagMessagesKey( logData->logId, tagData->tag, currentVersion ), wr.toValue() ) ); - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); } } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } } } @@ -624,7 +624,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. 
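The dominant change across these files is mechanical: every bare integer task ID (TaskUpdateStorage, TaskTLogPop, TaskTLogCommit, and so on) becomes a member of the scoped TaskPriority enum, and signatures that previously carried "int taskID" now take "TaskPriority taskID". The sketch below is illustrative only; the real enumerators and their numeric values live in flow/network.h, and the names shown here simply demonstrate why the scoped type is safer than a plain int.

    #include <cstdint>

    // Illustrative stand-in for the TaskPriority enum: a scoped enum gives
    // priorities a distinct type, so call sites can no longer pass an arbitrary
    // integer or silently mix a priority with some other int argument.
    enum class TaskPriority : int64_t {
        UpdateStorage,
        TLogPop,
        TLogCommit,
        DefaultYield,
        DefaultEndpoint
    };

    // Before: Future<Void> eraseMessagesBefore(..., int taskID);
    // After:  Future<Void> eraseMessagesBefore(..., TaskPriority taskID);
    void schedule(TaskPriority priority) { /* enqueue work at the given priority */ }

    int main() {
        schedule(TaskPriority::UpdateStorage); // compiles
        // schedule(3000);                     // rejected: no implicit conversion from int
        return 0;
    }

One visible consequence is that default arguments change in kind as well, for example from "int const& taskID=TaskDefaultEndpoint" to "TaskPriority const& taskID=TaskPriority::DefaultEndpoint" in WaitFailure.h further down.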
@@ -634,22 +634,22 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { if(logData->tag_data[tagLocality][tagId]) { - wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } } } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -674,7 +674,7 @@ ACTOR Future updateStorage( TLogData* self ) { } if(!self->queueOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -698,7 +698,7 @@ ACTOR Future updateStorage( TLogData* self ) { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { @@ -707,7 +707,7 @@ ACTOR Future updateStorage( TLogData* self ) { wait( updatePersistentData(self, logData, nextVersion) ); commitLockReleaser.release(); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -718,9 +718,9 @@ ACTOR Future updateStorage( TLogData* self ) { if(logData->persistentDataDurableVersion == logData->version.get()) { self->queueOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -741,7 +741,7 @@ ACTOR Future updateStorage( TLogData* self ) { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { wait( self->persistentDataCommitLock.take() ); @@ -751,21 +751,21 @@ ACTOR Future updateStorage( TLogData* self ) { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -943,7 +943,7 @@ ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere } if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop)); + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); } return Void(); @@ -1059,7 +1059,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if( req.tag.locality == tagLocalityLogRouter ) { wait( self->concurrentLogRouterReads.take() ); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait( delay(0.0, TaskLowPriority) ); + wait( delay(0.0, TaskPriority::Low) ); } if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { @@ -1068,7 +1068,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere // slightly faster over keeping the rest of the cluster operating normally. // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests // that impact recovery duration. 
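Several of the call sites touched here use delay(0, ...) not to sleep but to reschedule the running actor: a zero-length delay completes immediately, yet the actor only resumes once the run loop reaches the given TaskPriority, so expensive work proceeds at the intended priority rather than at the priority of the endpoint that delivered the request. The actor below is hypothetical (its name and body are not from this patch) and only sketches that idiom.

    // Hypothetical sketch of the "downgrade before real work" idiom.
    ACTOR Future<Void> serveSpilledPeek(TLogData* self, TLogPeekRequest req) {
        // The request arrived on a high-priority endpoint; resume at
        // TLogSpilledPeekReply so reads of spilled data do not starve
        // normal-priority cluster traffic.
        wait(delay(0, TaskPriority::TLogSpilledPeekReply));
        // ... read the spilled messages from persistentData and send the reply ...
        return Void();
    }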
- wait(delay(0, TaskTLogSpilledPeekReply)); + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } Version poppedVer = poppedVersion(logData, req.tag); @@ -1182,7 +1182,7 @@ ACTOR Future watchDegraded(TLogData* self) { //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask state int loopCount = 0; while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { - wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskPriority::Low)); loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); @@ -1518,7 +1518,7 @@ ACTOR Future tLogCommit( .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } // while exec op is being committed, no new transactions will be admitted. @@ -1858,7 +1858,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { @@ -1881,7 +1881,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } state Version ver = 0; @@ -1921,7 +1921,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } lastVer = ver; ver = r->version().version; @@ -1958,7 +1958,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } break; } diff --git a/fdbserver/Orderer.actor.h b/fdbserver/Orderer.actor.h index cd9d3d5a19..71f970ce45 100644 --- a/fdbserver/Orderer.actor.h +++ b/fdbserver/Orderer.actor.h @@ -38,7 +38,7 @@ public: ready = NotifiedVersion(s); started = false; } - Future order( Seq s, int taskID = TaskDefaultYield ) { + Future order( Seq s, TaskPriority taskID = TaskPriority::DefaultYield ) { if ( ready.get() < s ) return waitAndOrder( this, s, taskID ); else @@ -54,7 +54,7 @@ public: return ready.whenAtLeast(v); } private: - ACTOR static Future waitAndOrder( Orderer* self, Seq s, int taskID ) { + ACTOR static Future waitAndOrder( Orderer* self, Seq s, TaskPriority taskID ) { wait( self->ready.whenAtLeast(s) ); wait( yield( taskID ) || self->shutdown.getFuture() ); return self->dedup(s); diff --git a/fdbserver/QuietDatabase.actor.cpp b/fdbserver/QuietDatabase.actor.cpp index 3e983c5db8..771e8f47d0 100644 --- a/fdbserver/QuietDatabase.actor.cpp +++ b/fdbserver/QuietDatabase.actor.cpp @@ -292,6 +292,15 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr int64_t maxMachineTeamNumber = boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeams")); // TODO: Get 
finer granularity check + int64_t minServerTeamOnServer = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MinTeamNumberOnServer")); + int64_t maxServerTeamOnServer = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxTeamNumberOnServer")); + int64_t minMachineTeamOnMachine = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MinMachineTeamNumberOnMachine")); + int64_t maxMachineTeamOnMachine = + boost::lexical_cast(teamCollectionInfoMessage.getValue("MaxMachineTeamNumberOnMachine")); + // Team number is always valid when we disable teamRemover. This avoids false positive in simulation test if (SERVER_KNOBS->TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER || SERVER_KNOBS->TR_FLAG_DISABLE_SERVER_TEAM_REMOVER) { TraceEvent("GetTeamCollectionValid") @@ -302,7 +311,10 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr // The if condition should be consistent with the condition in teamRemover() that decides // if redundant teams exist. - if (healthyMachineTeamCount > desiredMachineTeamNumber || currentTeamNumber > desiredTeamNumber) { + if (healthyMachineTeamCount > desiredMachineTeamNumber || currentTeamNumber > desiredTeamNumber + (minMachineTeamOnMachine <= 0 && SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER == 3)) { + // When DESIRED_TEAMS_PER_SERVER == 1, we see minMachineTeamOnMachine can be 0 in one out of 30k test + // cases. Only check DESIRED_TEAMS_PER_SERVER == 3 for now since it is mostly used configuration. TraceEvent("GetTeamCollectionValid") .detail("CurrentTeamNumber", currentTeamNumber) .detail("DesiredTeamNumber", desiredTeamNumber) @@ -310,7 +322,13 @@ ACTOR Future getTeamCollectionValid(Database cx, WorkerInterface dataDistr .detail("CurrentHealthyMachineTeamNumber", healthyMachineTeamCount) .detail("DesiredMachineTeams", desiredMachineTeamNumber) .detail("CurrentMachineTeamNumber", currentMachineTeamNumber) - .detail("MaxMachineTeams", maxMachineTeamNumber); + .detail("MaxMachineTeams", maxMachineTeamNumber) + .detail("MinTeamNumberOnServer", minServerTeamOnServer) + .detail("MaxTeamNumberOnServer", maxServerTeamOnServer) + .detail("MinMachineTeamNumberOnMachine", minMachineTeamOnMachine) + .detail("MaxMachineTeamNumberOnMachine", maxMachineTeamOnMachine) + .detail("DesiredTeamsPerServer", SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER) + .detail("MaxTeamsPerServer", SERVER_KNOBS->MAX_TEAMS_PER_SERVER); return false; } else { return true; diff --git a/fdbserver/Ratekeeper.actor.cpp b/fdbserver/Ratekeeper.actor.cpp index 3f6fbcb600..91f44ef99c 100644 --- a/fdbserver/Ratekeeper.actor.cpp +++ b/fdbserver/Ratekeeper.actor.cpp @@ -297,7 +297,7 @@ ACTOR Future trackEachStorageServer( ACTOR Future monitorServerListChange( Reference> dbInfo, PromiseStream< std::pair> > serverChanges) { - state Database db = openDBOnServer(dbInfo, TaskRatekeeper, true, true); + state Database db = openDBOnServer(dbInfo, TaskPriority::Ratekeeper, true, true); state std::map oldServers; state Transaction tr(db); @@ -618,7 +618,7 @@ void updateRate(RatekeeperData* self, RatekeeperLimits* limits) { } ACTOR Future configurationMonitor(Reference> dbInfo, DatabaseConfiguration* conf) { - state Database cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + state Database cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); @@ -650,7 +650,7 @@ ACTOR Future ratekeeper(RatekeeperInterface rkInterf, Reference err; state Future collection = actorCollection( self.addActor.getFuture() ); - 
TraceEvent("Ratekeeper_Starting", rkInterf.id()); + TraceEvent("RatekeeperStarting", rkInterf.id()); self.addActor.send( waitFailureServer(rkInterf.waitFailure.getFuture()) ); self.addActor.send( configurationMonitor(dbInfo, &self.configuration) ); @@ -732,7 +732,7 @@ ACTOR Future ratekeeper(RatekeeperInterface rkInterf, Reference resolveBatch( } } - if (check_yield(TaskDefaultEndpoint)) { - wait( delay( 0, TaskLowPriority ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right? - g_network->setCurrentTask(TaskDefaultEndpoint); + if (check_yield(TaskPriority::DefaultEndpoint)) { + wait( delay( 0, TaskPriority::Low ) || delay( SERVER_KNOBS->COMMIT_SLEEP_TIME ) ); // FIXME: Is this still right? + g_network->setCurrentTask(TaskPriority::DefaultEndpoint); } if (self->version.get() == req.prevVersion) { // Not a duplicate (check relies on no waiting between here and self->version.set() below!) diff --git a/fdbserver/ResolverInterface.h b/fdbserver/ResolverInterface.h index 2bb808d84b..65b46a5941 100644 --- a/fdbserver/ResolverInterface.h +++ b/fdbserver/ResolverInterface.h @@ -44,8 +44,8 @@ struct ResolverInterface { bool operator != ( ResolverInterface const& r ) const { return id() != r.id(); } NetworkAddress address() const { return resolve.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - metrics.getEndpoint( TaskResolutionMetrics ); - split.getEndpoint( TaskResolutionMetrics ); + metrics.getEndpoint( TaskPriority::ResolutionMetrics ); + split.getEndpoint( TaskPriority::ResolutionMetrics ); } template diff --git a/fdbserver/RestoreInterface.h b/fdbserver/RestoreInterface.h index 847e940d9b..670946d9ab 100644 --- a/fdbserver/RestoreInterface.h +++ b/fdbserver/RestoreInterface.h @@ -37,7 +37,7 @@ struct RestoreInterface { NetworkAddress address() const { return test.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - test.getEndpoint( TaskClusterController ); + test.getEndpoint( TaskPriority::ClusterController ); } template diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 81330eac10..95e14136f9 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -215,7 +215,7 @@ ACTOR Future simulatedFDBDRebooter(Referencec_str(), coordFolder->c_str()); wait(g_simulator.onProcess(process, - TaskDefaultYield)); // Now switch execution to the process on which we will run + TaskPriority::DefaultYield)); // Now switch execution to the process on which we will run state Future onShutdown = process->onShutdown(); try { @@ -1399,7 +1399,7 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot Standalone(deterministicRandom()->randomUniqueID().toString()), Optional>()), ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource), "", ""), - TaskDefaultYield)); + TaskPriority::DefaultYield)); Sim2FileSystem::newFileSystem(); FlowTransport::createInstance(true, 1); if (tlsOptions->enabled()) { diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 47c61aeb9f..e4c31fdf3d 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1809,7 +1809,7 @@ ACTOR Future layerStatusFetcher(Database cx, JsonBuilderArray ACTOR Future lockedStatusFetcher(Reference> db, JsonBuilderArray *messages, std::set *incomplete_reasons) { state JsonBuilderObject statusObj; - state Database cx = openDBOnServer(db, TaskDefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware + state Database cx = 
openDBOnServer(db, TaskPriority::DefaultEndpoint, true, false); // Open a new database connection that isn't lock-aware state Transaction tr(cx); state int timeoutSeconds = 5; state Future getTimeout = delay(timeoutSeconds); diff --git a/fdbserver/TLogInterface.h b/fdbserver/TLogInterface.h index 638d7ca41d..635d1e6d27 100644 --- a/fdbserver/TLogInterface.h +++ b/fdbserver/TLogInterface.h @@ -56,11 +56,11 @@ struct TLogInterface { bool operator == ( TLogInterface const& r ) const { return id() == r.id(); } NetworkAddress address() const { return peekMessages.getEndpoint().getPrimaryAddress(); } void initEndpoints() { - getQueuingMetrics.getEndpoint( TaskTLogQueuingMetrics ); - popMessages.getEndpoint( TaskTLogPop ); - peekMessages.getEndpoint( TaskTLogPeek ); - confirmRunning.getEndpoint( TaskTLogConfirmRunning ); - commit.getEndpoint( TaskTLogCommit ); + getQueuingMetrics.getEndpoint( TaskPriority::TLogQueuingMetrics ); + popMessages.getEndpoint( TaskPriority::TLogPop ); + peekMessages.getEndpoint( TaskPriority::TLogPeek ); + confirmRunning.getEndpoint( TaskPriority::TLogConfirmRunning ); + commit.getEndpoint( TaskPriority::TLogCommit ); } template diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 96a63c1d39..6ac481c6fa 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -349,7 +349,7 @@ struct TLogData : NonCopyable { concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() { - cx = openDBOnServer(dbInfo, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(dbInfo, TaskPriority::DefaultEndpoint, true, true); } }; @@ -379,7 +379,7 @@ struct LogData : NonCopyable, public ReferenceCounted { } // Erase messages not needed to update *from* versions >= before (thus, messages with toversion <= before) - ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, int taskID ) { + ACTOR Future eraseMessagesBefore( TagData *self, Version before, TLogData *tlogData, Reference logData, TaskPriority taskID ) { while(!self->versionMessages.empty() && self->versionMessages.front().first < before) { Version version = self->versionMessages.front().first; std::pair &sizes = logData->version_sizes[version]; @@ -408,7 +408,7 @@ struct LogData : NonCopyable, public ReferenceCounted { return Void(); } - Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, int taskID) { + Future eraseMessagesBefore(Version before, TLogData *tlogData, Reference logData, TaskPriority taskID) { return eraseMessagesBefore(this, before, tlogData, logData, taskID); } }; @@ -766,7 +766,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { state Reference tagData = logData->tag_data[tagLocality][tagId]; if(tagData) { - wait(tagData->eraseMessagesBefore( tagData->popped, self, logData, TaskUpdateStorage )); + wait(tagData->eraseMessagesBefore( tagData->popped, self, logData, TaskPriority::UpdateStorage )); state Version currentVersion = 0; // Clear recently popped versions from persistentData if necessary updatePersistentPopped( self, logData, tagData ); @@ -819,7 +819,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD wr << uint32_t(0); } - Future f = yield(TaskUpdateStorage); + Future f = yield(TaskPriority::UpdateStorage); if(!f.isReady()) { wait(f); msg = 
std::upper_bound(tagData->versionMessages.begin(), tagData->versionMessages.end(), std::make_pair(currentVersion, LengthPrefixedStringRef()), CompareFirst>()); @@ -832,7 +832,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD tagData->poppedLocation = std::min(tagData->poppedLocation, firstLocation); } - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } } } @@ -847,7 +847,7 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD logData->persistentDataVersion = newPersistentDataVersion; wait( self->persistentData->commit() ); // SOMEDAY: This seems to be running pretty often, should we slow it down??? - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); // Now that the changes we made to persistentData are durable, erase the data we moved from memory and the queue, increase bytesDurable accordingly, and update persistentDataDurableVersion. @@ -857,22 +857,22 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD for(tagLocality = 0; tagLocality < logData->tag_data.size(); tagLocality++) { for(tagId = 0; tagId < logData->tag_data[tagLocality].size(); tagId++) { if(logData->tag_data[tagLocality][tagId]) { - wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskUpdateStorage )); - wait(yield(TaskUpdateStorage)); + wait(logData->tag_data[tagLocality][tagId]->eraseMessagesBefore( newPersistentDataVersion+1, self, logData, TaskPriority::UpdateStorage )); + wait(yield(TaskPriority::UpdateStorage)); } } } logData->version_sizes.erase(logData->version_sizes.begin(), logData->version_sizes.lower_bound(logData->persistentDataDurableVersion)); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); while(!logData->messageBlocks.empty() && logData->messageBlocks.front().first <= newPersistentDataVersion) { int64_t bytesErased = int64_t(logData->messageBlocks.front().second.size()) * SERVER_KNOBS->TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR; logData->bytesDurable += bytesErased; self->bytesDurable += bytesErased; logData->messageBlocks.pop_front(); - wait(yield(TaskUpdateStorage)); + wait(yield(TaskPriority::UpdateStorage)); } if(logData->bytesDurable.getValue() > logData->bytesInput.getValue() || self->bytesDurable > self->bytesInput) { @@ -915,7 +915,7 @@ ACTOR Future updateStorage( TLogData* self ) { } if(!self->spillOrder.size()) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); return Void(); } @@ -940,7 +940,7 @@ ACTOR Future updateStorage( TLogData* self ) { } wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); //TraceEvent("TlogUpdatePersist", self->dbgid).detail("LogId", logData->logId).detail("NextVersion", nextVersion).detail("Version", logData->version.get()).detail("PersistentDataDurableVer", logData->persistentDataDurableVersion).detail("QueueCommitVer", logData->queueCommittedVersion.get()).detail("PersistDataVer", logData->persistentDataVersion); if (nextVersion > logData->persistentDataVersion) { @@ -953,7 +953,7 @@ ACTOR Future updateStorage( TLogData* self ) { } commitLockReleaser.release(); } else { - wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } if( logData->removed.isReady() ) { @@ -964,9 +964,9 @@ ACTOR Future updateStorage( TLogData* self ) { if(logData->persistentDataDurableVersion == logData->version.get()) { self->spillOrder.pop_front(); } - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } } else if(logData->initialized) { @@ -988,7 +988,7 @@ ACTOR Future updateStorage( TLogData* self ) { //TraceEvent("UpdateStorageVer", logData->logId).detail("NextVersion", nextVersion).detail("PersistentDataVersion", logData->persistentDataVersion).detail("TotalSize", totalSize); wait( logData->queueCommittedVersion.whenAtLeast( nextVersion ) ); - wait( delay(0, TaskUpdateStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); if (nextVersion > logData->persistentDataVersion) { wait( self->persistentDataCommitLock.take() ); @@ -1001,21 +1001,21 @@ ACTOR Future updateStorage( TLogData* self ) { } if( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT ) { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } else { //recovery wants to commit to persistant data when updatePersistentData is not active, this delay ensures that immediately after //updatePersist returns another one has not been started yet. - wait( delay(0.0, TaskUpdateStorage) ); + wait( delay(0.0, TaskPriority::UpdateStorage) ); } } else { - wait( delay(BUGGIFY ? SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskUpdateStorage) ); + wait( delay(BUGGIFY ? 
SERVER_KNOBS->BUGGIFY_TLOG_STORAGE_MIN_UPDATE_INTERVAL : SERVER_KNOBS->TLOG_STORAGE_MIN_UPDATE_INTERVAL, TaskPriority::UpdateStorage) ); } return Void(); } ACTOR Future updateStorageLoop( TLogData* self ) { - wait(delay(0, TaskUpdateStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); loop { wait( updateStorage(self) ); @@ -1194,7 +1194,7 @@ ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Refere } if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskTLogPop)); + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); } return Void(); @@ -1346,7 +1346,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if( req.tag.locality == tagLocalityLogRouter ) { wait( self->concurrentLogRouterReads.take() ); state FlowLock::Releaser globalReleaser(self->concurrentLogRouterReads); - wait( delay(0.0, TaskLowPriority) ); + wait( delay(0.0, TaskPriority::Low) ); } if( req.begin <= logData->persistentDataDurableVersion && req.tag != txsTag) { @@ -1355,7 +1355,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere // slightly faster over keeping the rest of the cluster operating normally. // txsTag is only ever peeked on recovery, and we would still wish to prioritize requests // that impact recovery duration. - wait(delay(0, TaskTLogSpilledPeekReply)); + wait(delay(0, TaskPriority::TLogSpilledPeekReply)); } Version poppedVer = poppedVersion(logData, req.tag); @@ -1464,7 +1464,7 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere if (earlyEnd) break; } earlyEnd = earlyEnd || (kvrefs.size() >= SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_BATCHES_PER_PEEK+1); - wait( self->peekMemoryLimiter.take(TaskTLogSpilledPeekReply, commitBytes) ); + wait( self->peekMemoryLimiter.take(TaskPriority::TLogSpilledPeekReply, commitBytes) ); state FlowLock::Releaser memoryReservation(self->peekMemoryLimiter, commitBytes); state std::vector>> messageReads; messageReads.reserve( commitLocations.size() ); @@ -1556,7 +1556,7 @@ ACTOR Future watchDegraded(TLogData* self) { //This delay is divided into multiple delays to avoid marking the tlog as degraded because of a single SlowTask state int loopCount = 0; while(loopCount < SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT) { - wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskLowPriority)); + wait(delay(SERVER_KNOBS->TLOG_DEGRADED_DURATION/SERVER_KNOBS->TLOG_DEGRADED_DELAY_COUNT, TaskPriority::Low)); loopCount++; } TraceEvent(SevWarnAlways, "TLogDegraded", self->dbgid); @@ -1892,7 +1892,7 @@ ACTOR Future tLogCommit( .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } // while exec op is being committed, no new transactions will be admitted. @@ -2239,7 +2239,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st while (!endVersion.present() || logData->version.get() < endVersion.get()) { loop { choose { - when(wait( r ? r->getMore(TaskTLogCommit) : Never() ) ) { + when(wait( r ? 
r->getMore(TaskPriority::TLogCommit) : Never() ) ) { break; } when( wait( dbInfoChange ) ) { @@ -2262,7 +2262,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st .detail("PersistentDataDurableVersion", logData->persistentDataDurableVersion); waitStartT = now(); } - wait( delayJittered(.005, TaskTLogCommit) ); + wait( delayJittered(.005, TaskPriority::TLogCommit) ); } state Version ver = 0; @@ -2302,7 +2302,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } lastVer = ver; ver = r->version().version; @@ -2339,7 +2339,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( ver ); - wait( yield(TaskTLogCommit) ); + wait( yield(TaskPriority::TLogCommit) ); } break; } diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 2e25daae3b..15cb21b24a 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -431,7 +431,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted> tLogCommitResults; for(int loc=0; loc< it->logServers.size(); loc++) { Standalone msg = data.getMessages(location); - allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, data.getHasExecOp(), debugID ), TaskTLogCommitReply ) ); + allReplies.push_back( it->logServers[loc]->get().interf().commit.getReply( TLogCommitRequest( msg.arena(), prevVersion, version, knownCommittedVersion, minKnownCommittedVersion, msg, data.getHasExecOp(), debugID ), TaskPriority::TLogCommitReply ) ); Future commitSuccess = success(allReplies.back()); addActor.get().send(commitSuccess); tLogCommitResults.push_back(commitSuccess); @@ -961,7 +961,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedget().present() ) { alive.push_back( brokenPromiseToNever( t->get().interf().confirmRunning.getReply( TLogConfirmRunningRequest(debugID), - TaskTLogConfirmRunningReply ) ) ); + TaskPriority::TLogConfirmRunningReply ) ) ); numPresent++; } else { alive.push_back( Never() ); @@ -1477,7 +1477,16 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedrejoins = rejoins; logSystem->lockResults = lockResults; logSystem->recoverAt = minEnd; - logSystem->knownCommittedVersion = knownCommittedVersion; + if (knownCommittedVersion > minEnd) { + // FIXME: Remove the Sev40 once disk snapshot v2 feature is enabled, in all other + // code paths we should never be here. 
+ TraceEvent(SevError, "KCVIsInvalid") + .detail("KnownCommittedVersion", knownCommittedVersion) + .detail("MinEnd", minEnd); + logSystem->knownCommittedVersion = minEnd; + } else { + logSystem->knownCommittedVersion = knownCommittedVersion; + } logSystem->remoteLogsWrittenToCoreState = true; logSystem->stopped = true; logSystem->pseudoLocalities = prevState.pseudoLocalities; diff --git a/fdbserver/VFSAsync.cpp b/fdbserver/VFSAsync.cpp index 95e6b958a4..3d53aaccfb 100644 --- a/fdbserver/VFSAsync.cpp +++ b/fdbserver/VFSAsync.cpp @@ -713,7 +713,7 @@ static int asyncSleep(sqlite3_vfs *pVfs, int microseconds){ waitFor( delay(FLOW_KNOBS->MAX_BUGGIFIED_DELAY) ); return 0; } - waitFor( g_network->delay( microseconds*1e-6, TaskDefaultDelay ) || simCancel ); + waitFor( g_network->delay( microseconds*1e-6, TaskPriority::DefaultDelay ) || simCancel ); return microseconds; } catch( Error &e ) { TraceEvent(SevError, "AsyncSleepError").error(e,true); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 5834687548..376ea8bdf0 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -431,7 +431,14 @@ struct RedwoodRecordRef { }; uint8_t flags; - byte data[]; + + inline byte * data() { + return (byte *)(this + 1); + } + + inline const byte * data() const { + return (const byte *)(this + 1); + } void setPrefixSource(bool val) { if(val) { @@ -447,7 +454,7 @@ struct RedwoodRecordRef { } RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const { - Reader r(data); + Reader r(data()); int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS; int prefixLen = r.readVarInt(); @@ -501,19 +508,19 @@ struct RedwoodRecordRef { } int size() const { - Reader r(data); + Reader r(data()); int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS; r.readVarInt(); // prefixlen int valueLen = (flags & HAS_VALUE) ? r.read() : 0; int keySuffixLen = (flags & HAS_KEY_SUFFIX) ? r.readVarInt() : 0; - return sizeof(Delta) + r.rptr - data + intFieldSuffixLen + valueLen + keySuffixLen; + return sizeof(Delta) + r.rptr - data() + intFieldSuffixLen + valueLen + keySuffixLen; } // Delta can't be determined without the RedwoodRecordRef upon which the Delta is based. 
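The Redwood changes in VersionedBTree.actor.cpp replace the non-standard trailing array members (byte data[] in Delta, LogicalPageID extensionPages[0] in BTreePage) with accessors that compute the address of the variable-length payload as the first byte after the fixed-size header, and GetHeaderSize now multiplies the extension-page count by sizeof(LogicalPageID) instead of adding it once. The struct below is a simplified sketch of that pattern, with field names and types reduced for illustration.

    #include <cstdint>

    #pragma pack(push, 1)
    struct PageHeader {
        uint16_t count;
        uint8_t  extensionPageCount;

        // The variable-length array of page IDs lives immediately after the
        // header; an accessor replaces the zero-length array member.
        uint32_t* extensionPages() { return reinterpret_cast<uint32_t*>(this + 1); }
        const uint32_t* extensionPages() const { return reinterpret_cast<const uint32_t*>(this + 1); }

        // Header bytes plus the trailing IDs; note the multiplication, which is
        // the arithmetic BTreePage::GetHeaderSize now uses.
        static int headerSize(int extensionPageCount) {
            return int(sizeof(PageHeader)) + extensionPageCount * int(sizeof(uint32_t));
        }
    };
    #pragma pack(pop)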
std::string toString() const { - Reader r(data); + Reader r(data()); std::string flagString = " "; if(flags & PREFIX_SOURCE) flagString += "prefixSource "; @@ -638,7 +645,7 @@ struct RedwoodRecordRef { commonPrefix = getCommonPrefixLen(base, 0); } - Writer w(d.data); + Writer w(d.data()); // prefixLen w.writeVarInt(commonPrefix); @@ -688,7 +695,7 @@ struct RedwoodRecordRef { w.writeString(value.get()); } - return w.wptr - d.data + sizeof(Delta); + return w.wptr - d.data() + sizeof(Delta); } template @@ -737,10 +744,17 @@ struct BTreePage { uint16_t count; uint32_t kvBytes; uint8_t extensionPageCount; - LogicalPageID extensionPages[0]; }; #pragma pack(pop) + inline LogicalPageID * extensionPages() { + return (LogicalPageID *)(this + 1); + } + + inline const LogicalPageID * extensionPages() const { + return (const LogicalPageID *)(this + 1); + } + int size() const { const BinaryTree *t = &tree(); return (uint8_t *)t - (uint8_t *)this + t->size(); @@ -751,15 +765,15 @@ struct BTreePage { } BinaryTree & tree() { - return *(BinaryTree *)(extensionPages + extensionPageCount); + return *(BinaryTree *)(extensionPages() + extensionPageCount); } const BinaryTree & tree() const { - return *(const BinaryTree *)(extensionPages + extensionPageCount); + return *(const BinaryTree *)(extensionPages() + extensionPageCount); } static inline int GetHeaderSize(int extensionPages = 0) { - return sizeof(BTreePage) + extensionPages + sizeof(LogicalPageID); + return sizeof(BTreePage) + (extensionPages * sizeof(LogicalPageID)); } std::string toString(bool write, LogicalPageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { @@ -1603,7 +1617,7 @@ private: for(int e = 0, eEnd = extPages.size(); e < eEnd; ++e) { LogicalPageID eid = m_pager->allocateLogicalPage(); debug_printf("%p: writePages(): Writing extension page op=write id=%u @%" PRId64 " (%d of %lu) referencePageID=%u\n", actor_debug, eid, version, e + 1, extPages.size(), id); - newPage->extensionPages[e] = bigEndian32(eid); + newPage->extensionPages()[e] = bigEndian32(eid); // If replacing the primary page below (version == 0) then pass the primary page's ID as the reference page ID m_pager->writePage(eid, extPages[e], version, (version == 0) ? 
id : invalidLogicalPageID); ++counts.extPageWrites; @@ -1620,8 +1634,8 @@ private: // Free the old extension pages now that all replacement pages have been written for(int i = 0; i < originalPage->extensionPageCount; ++i) { - //debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages[i])); - //m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages[i]), version); + //debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages()[i])); + //m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages()[i]), version); } return primaryLogicalPageIDs; @@ -1670,7 +1684,7 @@ private: ACTOR static Future> readPage(Reference snapshot, LogicalPageID id, int usablePageSize, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { debug_printf("readPage() op=read id=%u @%" PRId64 " lower=%s upper=%s\n", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); - wait(delay(0, TaskDiskRead)); + wait(delay(0, TaskPriority::DiskRead)); state Reference result = wait(snapshot->getPhysicalPage(id)); ++counts.pageReads; @@ -1684,8 +1698,8 @@ private: pageGets.push_back(std::move(result)); for(int i = 0; i < pTreePage->extensionPageCount; ++i) { - debug_printf("readPage() Reading extension page op=read id=%u @%" PRId64 " ext=%d/%d\n", bigEndian32(pTreePage->extensionPages[i]), snapshot->getVersion(), i + 1, (int)pTreePage->extensionPageCount); - pageGets.push_back(snapshot->getPhysicalPage(bigEndian32(pTreePage->extensionPages[i]))); + debug_printf("readPage() Reading extension page op=read id=%u @%" PRId64 " ext=%d/%d\n", bigEndian32(pTreePage->extensionPages()[i]), snapshot->getVersion(), i + 1, (int)pTreePage->extensionPageCount); + pageGets.push_back(snapshot->getPhysicalPage(bigEndian32(pTreePage->extensionPages()[i]))); } std::vector> pages = wait(getAll(pageGets)); @@ -3561,12 +3575,12 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { while(1) { if(fwd.get() != items[i]) { printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), items[i].toString().c_str()); - printf("Delta: %s\n", fwd.node->raw->delta->toString().c_str()); + printf("Delta: %s\n", fwd.node->raw->delta().toString().c_str()); ASSERT(false); } if(rev.get() != items[items.size() - 1 - i]) { printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str()); - printf("Delta: %s\n", rev.node->raw->delta->toString().c_str()); + printf("Delta: %s\n", rev.node->raw->delta().toString().c_str()); ASSERT(false); } ++i; diff --git a/fdbserver/WaitFailure.actor.cpp b/fdbserver/WaitFailure.actor.cpp index 778128f830..6ab6efeb74 100644 --- a/fdbserver/WaitFailure.actor.cpp +++ b/fdbserver/WaitFailure.actor.cpp @@ -37,7 +37,7 @@ ACTOR Future waitFailureServer(FutureStream> waitFailur } } -ACTOR Future waitFailureClient(RequestStream> waitFailure, double reactionTime, double reactionSlope, int taskID){ +ACTOR Future waitFailureClient(RequestStream> waitFailure, double reactionTime, double reactionSlope, TaskPriority taskID){ loop { try { state double start = now(); @@ -55,7 +55,7 @@ ACTOR Future waitFailureClient(RequestStream> waitFailu } } -ACTOR Future waitFailureClientStrict(RequestStream> waitFailure, double failureReactionTime, int taskID){ +ACTOR Future waitFailureClientStrict(RequestStream> waitFailure, 
double failureReactionTime, TaskPriority taskID){ loop { wait(waitFailureClient(waitFailure, 0, 0, taskID)); wait(delay(failureReactionTime, taskID) || IFailureMonitor::failureMonitor().onStateEqual( waitFailure.getEndpoint(), FailureStatus(false))); @@ -65,7 +65,7 @@ ACTOR Future waitFailureClientStrict(RequestStream> wai } } -ACTOR Future waitFailureTracker(RequestStream> waitFailure, Reference> failed, double reactionTime, double reactionSlope, int taskID){ +ACTOR Future waitFailureTracker(RequestStream> waitFailure, Reference> failed, double reactionTime, double reactionSlope, TaskPriority taskID){ loop { try { failed->set( IFailureMonitor::failureMonitor().getState(waitFailure.getEndpoint()).isFailed() ); diff --git a/fdbserver/WaitFailure.h b/fdbserver/WaitFailure.h index 9ef3b4c3a0..413dc9a56a 100644 --- a/fdbserver/WaitFailure.h +++ b/fdbserver/WaitFailure.h @@ -26,13 +26,13 @@ Future waitFailureServer(const FutureStream>& waitFailu // talks to a wait failure server, returns Void on failure Future waitFailureClient(const RequestStream>& waitFailure, - double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); + double const& failureReactionTime=0, double const& failureReactionSlope=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); // talks to a wait failure server, returns Void on failure, reaction time is always waited -Future waitFailureClientStrict(const RequestStream>& waitFailure, double const& failureReactionTime=0, int const& taskID=TaskDefaultEndpoint); +Future waitFailureClientStrict(const RequestStream>& waitFailure, double const& failureReactionTime=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); // talks to a wait failure server, updates failed to be true or false based on failure status. 
Future waitFailureTracker(const RequestStream>& waitFailure, Reference> const& failed, - double const& failureReactionTime=0, double const& failureReactionSlope=0, int const& taskID=TaskDefaultEndpoint); + double const& failureReactionTime=0, double const& failureReactionSlope=0, TaskPriority const& taskID=TaskPriority::DefaultEndpoint); -#endif \ No newline at end of file +#endif diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index ba33037759..27ad0f0967 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -369,6 +369,7 @@ struct Role { static const Role LOG_ROUTER; static const Role DATA_DISTRIBUTOR; static const Role RATEKEEPER; + static const Role COORDINATOR; std::string roleName; std::string abbreviation; @@ -392,7 +393,7 @@ void endRole(const Role &role, UID id, std::string reason, bool ok = true, Error struct ServerDBInfo; -class Database openDBOnServer( Reference> const& db, int taskID = TaskDefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); +class Database openDBOnServer( Reference> const& db, TaskPriority taskID = TaskPriority::DefaultEndpoint, bool enableLocalityLoadBalance = true, bool lockAware = false ); ACTOR Future extractClusterInterface(Reference>> a, Reference>> b); diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 14d47d0a07..8b765b65bc 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -494,7 +494,7 @@ Future startSystemMonitor(std::string dataFolder, OptionalgetLocalAddress().ip)); systemMonitor(); - return recurring( &systemMonitor, 5.0, TaskFlushTrace ); + return recurring( &systemMonitor, 5.0, TaskPriority::FlushTrace ); } void testIndexedSet(); diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 4d6122291d..d0ac5392a5 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -464,7 +464,7 @@ Future sendMasterRegistration( MasterData* self, LogSystemConfig const& lo } ACTOR Future updateRegistration( Reference self, Reference logSystem ) { - state Database cx = openDBOnServer(self->dbInfo, TaskDefaultEndpoint, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, true, true); state Future trigger = self->registrationTrigger.onTrigger(); state Future updateLogsKey; @@ -1017,12 +1017,12 @@ ACTOR Future resolutionBalancing(Reference self) { state CoalescedKeyRangeMap key_resolver; key_resolver.insert(allKeys, 0); loop { - wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskResolutionMetrics)); + wait(delay(SERVER_KNOBS->MIN_BALANCE_TIME, TaskPriority::ResolutionMetrics)); while(self->resolverChanges.get().size()) wait(self->resolverChanges.onChange()); state std::vector> futures; for (auto& p : self->resolvers) - futures.push_back(brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskResolutionMetrics))); + futures.push_back(brokenPromiseToNever(p.metrics.getReply(ResolutionMetricsRequest(), TaskPriority::ResolutionMetrics))); wait( waitForAll(futures) ); state IndexedSet, NoMetric> metrics; @@ -1047,7 +1047,7 @@ ACTOR Future resolutionBalancing(Reference self) { req.offset = amount; req.range = range.first; - ResolutionSplitReply split = wait( brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply(req, TaskResolutionMetrics)) ); + ResolutionSplitReply split = wait( brokenPromiseToNever(self->resolvers[metrics.lastItem()->second].split.getReply(req, 
TaskPriority::ResolutionMetrics)) ); KeyRangeRef moveRange = range.second ? KeyRangeRef( range.first.begin, split.key ) : KeyRangeRef( split.key, range.first.end ); movedRanges.push_back_deep(movedRanges.arena(), ResolverMoveRef(moveRange, dest)); TraceEvent("MovingResolutionRange").detail("Src", src).detail("Dest", dest).detail("Amount", amount).detail("StartRange", range.first).detail("MoveRange", moveRange).detail("Used", split.used).detail("KeyResolverRanges", key_resolver.size()); @@ -1181,7 +1181,7 @@ ACTOR Future trackTlogRecovery( Reference self, Reference configurationMonitor( Reference self ) { - state Database cx = openDBOnServer(self->dbInfo, TaskDefaultEndpoint, true, true); + state Database cx = openDBOnServer(self->dbInfo, TaskPriority::DefaultEndpoint, true, true); loop { state ReadYourWritesTransaction tr(cx); diff --git a/fdbserver/networktest.actor.cpp b/fdbserver/networktest.actor.cpp index 61bf80ed55..795dd769c5 100644 --- a/fdbserver/networktest.actor.cpp +++ b/fdbserver/networktest.actor.cpp @@ -30,7 +30,7 @@ NetworkTestInterface::NetworkTestInterface( NetworkAddress remote ) NetworkTestInterface::NetworkTestInterface( INetwork* local ) { - test.makeWellKnownEndpoint( WLTOKEN_NETWORKTEST, TaskDefaultEndpoint ); + test.makeWellKnownEndpoint( WLTOKEN_NETWORKTEST, TaskPriority::DefaultEndpoint ); } ACTOR Future networkTestServer() { diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 587c438460..300e0c4d9c 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -551,7 +551,7 @@ public: newestDirtyVersion.insert(allKeys, invalidVersion); addShard( ShardInfo::newNotAssigned( allKeys ) ); - cx = openDBOnServer(db, TaskDefaultEndpoint, true, true); + cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true); } //~StorageServer() { fclose(log); } @@ -643,7 +643,7 @@ public: template Future readGuard(const Request& request, const HandleFunction& fun) { auto rate = currentRate(); - if (rate < SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD && deterministicRandom()->random01() > rate/SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD) { + if (rate < SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD && deterministicRandom()->random01() > std::max(SERVER_KNOBS->STORAGE_DURABILITY_LAG_MIN_RATE, rate/SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD)) { //request.error = future_version(); sendErrorWithPenalty(request.reply, server_overloaded(), getPenalty()); return Void(); @@ -829,7 +829,7 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); @@ -1345,7 +1345,7 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); try { if( req.debugID.present() ) @@ -1458,7 +1458,7 @@ ACTOR Future getKey( StorageServer* data, GetKeyRequest req ) { // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - wait( 
delay(0, TaskDefaultEndpoint) ); + wait( delay(0, TaskPriority::DefaultEndpoint) ); try { state Version version = wait( waitForVersion( data, req.version ) ); @@ -2003,7 +2003,7 @@ ACTOR Future fetchKeys( StorageServer *data, AddingShard* shard ) { TraceEvent(SevDebug, "FetchKeysVersionSatisfied", data->thisServerID).detail("FKID", interval.pairID); - wait( data->fetchKeysParallelismLock.take( TaskDefaultYield, fetchBlockBytes ) ); + wait( data->fetchKeysParallelismLock.take( TaskPriority::DefaultYield, fetchBlockBytes ) ); state FlowLock::Releaser holdingFKPL( data->fetchKeysParallelismLock, fetchBlockBytes ); state double executeStart = now(); @@ -2590,7 +2590,7 @@ ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) } data->behind = true; - wait( delayJittered(.005, TaskTLogPeekReply) ); + wait( delayJittered(.005, TaskPriority::TLogPeekReply) ); } while( data->byteSampleClearsTooLarge.get() ) { @@ -2617,7 +2617,7 @@ ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) *pReceivedUpdate = true; start = now(); - wait( data->durableVersionLock.take(TaskTLogPeekReply,1) ); + wait( data->durableVersionLock.take(TaskPriority::TLogPeekReply,1) ); state FlowLock::Releaser holdingDVL( data->durableVersionLock ); if(now() - start > 0.1) TraceEvent("SSSlowTakeLock1", data->thisServerID).detailf("From", "%016llx", debug_lastLoadBalanceResultEndpointToken).detail("Duration", now() - start).detail("Version", data->version.get()); @@ -2865,11 +2865,11 @@ ACTOR Future updateStorage(StorageServer* data) { if (g_network->isSimulated()) { double endTime = g_simulator.checkDisabled(format("%s/updateStorage", data->thisServerID.toString().c_str())); if(endTime > now()) { - wait(delay(endTime - now(), TaskStorage)); + wait(delay(endTime - now(), TaskPriority::UpdateStorage)); } } wait( data->desiredOldestVersion.whenAtLeast( data->storageVersion()+1 ) ); - wait( delay(0, TaskStorage) ); + wait( delay(0, TaskPriority::UpdateStorage) ); state Promise durableInProgress; data->durableInProgress = durableInProgress.getFuture(); @@ -2884,10 +2884,10 @@ ACTOR Future updateStorage(StorageServer* data) { state bool done = data->storage.makeVersionMutationsDurable(newOldestVersion, desiredVersion, bytesLeft); // We want to forget things from these data structures atomically with changing oldestVersion (and "before", since oldestVersion.set() may trigger waiting actors) // forgetVersionsBeforeAsync visibly forgets immediately (without waiting) but asynchronously frees memory. 
- Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( newOldestVersion, TaskStorage ); + Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( newOldestVersion, TaskPriority::UpdateStorage ); data->oldestVersion.set( newOldestVersion ); wait( finishedForgetting ); - wait( yield(TaskStorage) ); + wait( yield(TaskPriority::UpdateStorage) ); if (done) break; } @@ -2900,9 +2900,7 @@ ACTOR Future updateStorage(StorageServer* data) { state Future durableDelay = Void(); if (bytesLeft > 0) { - durableDelay = delay(SERVER_KNOBS->STORAGE_COMMIT_INTERVAL, TaskStorage); - } else { - durableDelay = delay(0, TaskUpdateStorage) || delay(SERVER_KNOBS->STORAGE_COMMIT_INTERVAL, TaskStorage); + durableDelay = delay(SERVER_KNOBS->STORAGE_COMMIT_INTERVAL, TaskPriority::UpdateStorage); } wait( durable ); @@ -2922,7 +2920,7 @@ ACTOR Future updateStorage(StorageServer* data) { } durableInProgress.send(Void()); - wait( delay(0, TaskStorage) ); //Setting durableInProgess could cause the storage server to shut down, so delay to check for cancellation + wait( delay(0, TaskPriority::UpdateStorage) ); //Setting durableInProgess could cause the storage server to shut down, so delay to check for cancellation // Taking and releasing the durableVersionLock ensures that no eager reads both begin before the commit was effective and // are applied after we change the durable version. Also ensure that we have to lock while calling changeDurableVersion, @@ -2931,9 +2929,9 @@ ACTOR Future updateStorage(StorageServer* data) { data->popVersion( data->durableVersion.get() + 1 ); while (!changeDurableVersion( data, newOldestVersion )) { - if(g_network->check_yield(TaskStorage)) { + if(g_network->check_yield(TaskPriority::UpdateStorage)) { data->durableVersionLock.release(); - wait(delay(0, TaskStorage)); + wait(delay(0, TaskPriority::UpdateStorage)); wait( data->durableVersionLock.take() ); } } @@ -3543,7 +3541,7 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac } } when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.recieved"); //.detail("TaskID", g_network->getCurrentTask()); @@ -3558,11 +3556,11 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac actors.add(self->readGuard(req, watchValueQ)); } when (GetKeyRequest req = waitNext(ssi.getKey.getFuture())) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work actors.add(self->readGuard(req , getKey)); } when (GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture()) ) { - // Warning: This code is executed at extremely high priority (TaskLoadBalancedEndpoint), so downgrade before doing real work + // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work actors.add(self->readGuard(req , getKeyValues)); } when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) { @@ -3644,7 +3642,7 @@ ACTOR Future 
memoryStoreRecover(IKeyValueStore* store, ReferenceREMOVE_RETRY_DELAY, TaskUpdateStorage) ); + wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::UpdateStorage) ); tr.reset(); TraceEvent("RemoveStorageServerRetrying").detail("Count", noCanRemoveCount++).detail("ServerID", id).detail("CanRemove", canRemove); } else { diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 68a3a48a91..f23d889c37 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -75,7 +75,7 @@ ACTOR static Future extractClientInfo( Reference> d } } -Database openDBOnServer( Reference> const& db, int taskID, bool enableLocalityLoadBalance, bool lockAware ) { +Database openDBOnServer( Reference> const& db, TaskPriority taskID, bool enableLocalityLoadBalance, bool lockAware ) { Reference> info( new AsyncVar ); return DatabaseContext::create( info, extractClientInfo(db, info), enableLocalityLoadBalance ? db->get().myLocality : LocalityData(), enableLocalityLoadBalance, taskID, lockAware ); } @@ -737,7 +737,7 @@ ACTOR Future workerServer( } } else { bool lockAware = metricsPrefix.size() && metricsPrefix[0] == '\xff'; - metricsLogger = runMetrics( openDBOnServer( dbInfo, TaskDefaultEndpoint, true, lockAware ), KeyRef(metricsPrefix) ); + metricsLogger = runMetrics( openDBOnServer( dbInfo, TaskPriority::DefaultEndpoint, true, lockAware ), KeyRef(metricsPrefix) ); } } @@ -1176,7 +1176,7 @@ ACTOR Future workerServer( } when( wait( loggingTrigger ) ) { systemMonitor(); - loggingTrigger = delay( loggingDelay, TaskFlushTrace ); + loggingTrigger = delay( loggingDelay, TaskPriority::FlushTrace ); } when(state ExecuteRequest req = waitNext(interf.execReq.getFuture())) { state ExecCmdValueString execArg(req.execPayload); @@ -1403,3 +1403,4 @@ const Role Role::TESTER("Tester", "TS"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); const Role Role::RATEKEEPER("Ratekeeper", "RK"); +const Role Role::COORDINATOR("Coordinator", "CD"); diff --git a/fdbserver/workloads/Cycle.actor.cpp b/fdbserver/workloads/Cycle.actor.cpp index e622ffe4a3..62c80af294 100644 --- a/fdbserver/workloads/Cycle.actor.cpp +++ b/fdbserver/workloads/Cycle.actor.cpp @@ -28,6 +28,7 @@ struct CycleWorkload : TestWorkload { int actorCount, nodeCount; double testDuration, transactionsPerSecond, minExpectedTransactionsPerSecond; Key keyPrefix; + bool checkOnly; vector> clients; PerfIntCounter transactions, retries, tooOldRetries, commitFailedRetries; @@ -44,6 +45,7 @@ struct CycleWorkload : TestWorkload { nodeCount = getOption(options, LiteralStringRef("nodeCount"), transactionsPerSecond * clientCount); keyPrefix = getOption(options, LiteralStringRef("keyPrefix"), LiteralStringRef("")); minExpectedTransactionsPerSecond = transactionsPerSecond * getOption(options, LiteralStringRef("expectedRate"), 0.7); + checkOnly = getOption(options, LiteralStringRef("checkOnly"), false); } virtual std::string description() { return "CycleWorkload"; } @@ -51,6 +53,7 @@ struct CycleWorkload : TestWorkload { return bulkSetup( cx, this, nodeCount, Promise() ); } virtual Future start( Database const& cx ) { + if (checkOnly) return Void(); for(int c=0; c promise; auto f = promise.getFuture(); keepAlive(f, database); - workloadImpl->start(reinterpret_cast(database.getPtr()), + workloadImpl->check(reinterpret_cast(database.getPtr()), GenericPromise(new FDBPromiseImpl(promise))); return f; } diff --git a/flow/Arena.h b/flow/Arena.h index b5280bae99..90ff501d25 100644 --- a/flow/Arena.h 
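The Cycle workload hunk above adds a `checkOnly` option that skips the start phase so an existing database can be verified without generating new traffic. A rough stand-in for that pattern, using a plain struct and a string map of options instead of the real TestWorkload/getOption machinery:

```
#include <iostream>
#include <map>
#include <string>

struct ToyCycleWorkload {
    bool checkOnly;

    explicit ToyCycleWorkload(const std::map<std::string, std::string>& options) {
        auto it = options.find("checkOnly");
        checkOnly = it != options.end() && it->second == "true"; // mirrors getOption(..., false)
    }

    void start() {
        if (checkOnly) return; // mirrors `if (checkOnly) return Void();`
        std::cout << "running cycle transactions\n";
    }

    bool check() {
        std::cout << "verifying the cycle invariant over existing data\n";
        return true;
    }
};

int main() {
    ToyCycleWorkload w({ { "checkOnly", "true" } });
    w.start();                 // no-op in check-only mode
    return w.check() ? 0 : 1;
}
```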
+++ b/flow/Arena.h @@ -767,7 +767,7 @@ inline void save( Archive& ar, const StringRef& value ) { template<> struct dynamic_size_traits : std::true_type { - static WriteRawMemory save(const StringRef& str) { return { { unownedPtr(str.begin()), str.size() } }; } + static Block save(const StringRef& str) { return unownedPtr(str.begin(), str.size()); } template static void load(const uint8_t* ptr, size_t sz, StringRef& str, Context& context) { diff --git a/flow/IThreadPool.h b/flow/IThreadPool.h index 5da60d2930..39d5d484a8 100644 --- a/flow/IThreadPool.h +++ b/flow/IThreadPool.h @@ -92,12 +92,12 @@ public: void send( T const& t ) { // Can be called safely from another thread. Call send or sendError at most once. Promise signal; tagAndForward( &promise, t, signal.getFuture() ); - g_network->onMainThread( std::move(signal), g_network->getCurrentTask() | 1 ); + g_network->onMainThread( std::move(signal), incrementPriorityIfEven( g_network->getCurrentTask() ) ); } void sendError( Error const& e ) { // Can be called safely from another thread. Call send or sendError at most once. Promise signal; tagAndForwardError( &promise, e, signal.getFuture() ); - g_network->onMainThread( std::move(signal), g_network->getCurrentTask() | 1 ); + g_network->onMainThread( std::move(signal), incrementPriorityIfEven( g_network->getCurrentTask() ) ); } private: Promise promise; @@ -106,4 +106,4 @@ private: Reference createGenericThreadPool(); -#endif \ No newline at end of file +#endif diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index eea95ab827..aaa2d829f3 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -100,9 +100,9 @@ public: struct OrderedTask { int64_t priority; - int taskID; + TaskPriority taskID; Task *task; - OrderedTask(int64_t priority, int taskID, Task* task) : priority(priority), taskID(taskID), task(task) {} + OrderedTask(int64_t priority, TaskPriority taskID, Task* task) : priority(priority), taskID(taskID), task(task) {} bool operator < (OrderedTask const& rhs) const { return priority < rhs.priority; } }; @@ -122,12 +122,12 @@ public: // INetwork interface virtual double now() { return currentTime; }; - virtual Future delay( double seconds, int taskId ); - virtual Future yield( int taskID ); - virtual bool check_yield(int taskId); - virtual int getCurrentTask() { return currentTaskID; } - virtual void setCurrentTask(int taskID ) { priorityMetric = currentTaskID = taskID; } - virtual void onMainThread( Promise&& signal, int taskID ); + virtual Future delay( double seconds, TaskPriority taskId ); + virtual Future yield( TaskPriority taskID ); + virtual bool check_yield(TaskPriority taskId); + virtual TaskPriority getCurrentTask() { return currentTaskID; } + virtual void setCurrentTask(TaskPriority taskID ) { currentTaskID = taskID; priorityMetric = (int64_t)taskID; } + virtual void onMainThread( Promise&& signal, TaskPriority taskID ); bool isOnMainThread() const override { return thread_network == this; } @@ -160,7 +160,7 @@ public: int64_t tsc_begin, tsc_end; double taskBegin; - int currentTaskID; + TaskPriority currentTaskID; uint64_t tasksIssued; TDMetricCollection tdmetrics; double currentTime; @@ -170,7 +170,7 @@ public: uint64_t numYields; double lastPriorityTrackTime; - int lastMinTaskID; + TaskPriority lastMinTaskID; double priorityTimer[NetworkMetrics::PRIORITY_BINS]; std::priority_queue> ready; @@ -178,15 +178,15 @@ public: struct DelayedTask : OrderedTask { double at; - DelayedTask(double at, int64_t priority, int taskID, Task* task) : at(at), OrderedTask(priority, 
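In IThreadPool.h the old `getCurrentTask() | 1` trick is replaced by the named helper `incrementPriorityIfEven`, whose definition appears in the flow/network.h hunk later in this patch. A small sketch of the behaviour, assuming a toy enum with the same DefaultYield value and a body modelled on that helper:

```
#include <cassert>

enum class TaskPriority { DefaultYield = 7000 };

// Modelled on the helper added to flow/network.h later in this patch.
inline TaskPriority incrementPriorityIfEven(TaskPriority p) {
    return static_cast<TaskPriority>(static_cast<int>(p) | 1);
}

int main() {
    // An even priority is bumped by one, so the forwarded task sorts just ahead
    // of other work queued at the same base priority...
    assert(incrementPriorityIfEven(TaskPriority::DefaultYield) == static_cast<TaskPriority>(7001));
    // ...and an odd priority is left alone, exactly like the old `taskID | 1`.
    assert(incrementPriorityIfEven(static_cast<TaskPriority>(7001)) == static_cast<TaskPriority>(7001));
}
```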
taskID, task) {} + DelayedTask(double at, int64_t priority, TaskPriority taskID, Task* task) : at(at), OrderedTask(priority, taskID, task) {} bool operator < (DelayedTask const& rhs) const { return at > rhs.at; } // Ordering is reversed for priority_queue }; std::priority_queue> timers; - void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, int64_t priority); - bool check_yield(int taskId, bool isRunLoop); + void checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority); + bool check_yield(TaskPriority taskId, bool isRunLoop); void processThreadReady(); - void trackMinPriority( int minTaskID, double now ); + void trackMinPriority( TaskPriority minTaskID, double now ); void stopImmediately() { stopped=true; decltype(ready) _1; ready.swap(_1); decltype(timers) _2; timers.swap(_2); } @@ -492,8 +492,8 @@ Net2::Net2(bool useThreadPool, bool useMetrics, bool useObjectSerializer) stopped(false), tasksIssued(0), // Until run() is called, yield() will always yield - tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskDefaultYield), - lastMinTaskID(0), + tsc_begin(0), tsc_end(0), taskBegin(0), currentTaskID(TaskPriority::DefaultYield), + lastMinTaskID(TaskPriority::Zero), numYields(0) { TraceEvent("Net2Starting"); @@ -514,7 +514,7 @@ Net2::Net2(bool useThreadPool, bool useMetrics, bool useObjectSerializer) int priBins[] = { 1, 2050, 3050, 4050, 4950, 5050, 7050, 8050, 10050 }; static_assert( sizeof(priBins) == sizeof(int)*NetworkMetrics::PRIORITY_BINS, "Fix priority bins"); for(int i=0; i(priBins[i]); updateNow(); } @@ -582,7 +582,7 @@ void Net2::run() { tsc_begin = __rdtsc(); taskBegin = timer_monotonic(); runFunc(); - checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskRunCycleFunction); + checkForSlowTask(tsc_begin, __rdtsc(), timer_monotonic() - taskBegin, TaskPriority::RunCycleFunction); } double sleepTime = 0; @@ -610,7 +610,7 @@ void Net2::run() { if ((now-nnow) > FLOW_KNOBS->SLOW_LOOP_CUTOFF && nondeterministicRandom()->random01() < (now-nnow)*FLOW_KNOBS->SLOW_LOOP_SAMPLING_RATE) TraceEvent("SomewhatSlowRunLoopTop").detail("Elapsed", now - nnow); - if (sleepTime) trackMinPriority( 0, now ); + if (sleepTime) trackMinPriority( TaskPriority::Zero, now ); while (!timers.empty() && timers.top().at < now) { ++countTimers; ready.push( timers.top() ); @@ -623,12 +623,12 @@ void Net2::run() { tsc_end = tsc_begin + FLOW_KNOBS->TSC_YIELD_TIME; taskBegin = timer_monotonic(); numYields = 0; - int minTaskID = TaskMaxPriority; + TaskPriority minTaskID = TaskPriority::Max; while (!ready.empty()) { ++countTasks; currentTaskID = ready.top().taskID; - priorityMetric = currentTaskID; + priorityMetric = static_cast(currentTaskID); minTaskID = std::min(minTaskID, currentTaskID); Task* task = ready.top().task; ready.pop(); @@ -641,7 +641,7 @@ void Net2::run() { TraceEvent(SevError, "TaskError").error(unknown_error()); } - if (check_yield(TaskMaxPriority, true)) { ++countYields; break; } + if (check_yield(TaskPriority::Max, true)) { ++countYields; break; } } nnow = timer_monotonic(); @@ -700,10 +700,10 @@ void Net2::run() { #endif } -void Net2::trackMinPriority( int minTaskID, double now ) { +void Net2::trackMinPriority( TaskPriority minTaskID, double now ) { if (minTaskID != lastMinTaskID) for(int c=0; c= minTaskID && pri < lastMinTaskID) { // busy -> idle double busyFor = lastPriorityTrackTime - priorityTimer[c]; networkMetrics.secSquaredPriorityBlocked[c] += busyFor*busyFor; @@ -726,7 +726,7 @@ void Net2::processThreadReady() { } } 
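The Net2 run-loop hunks keep using `std::min` and ordinary comparisons on `TaskPriority` while adding explicit casts where an integer is genuinely needed (for example `priorityMetric`). A brief sketch of why that works, assuming a cut-down enum: scoped enums keep the built-in relational operators but drop implicit conversion to integers.

```
#include <algorithm>
#include <cassert>
#include <cstdint>

enum class TaskPriority : std::int64_t { Max = 1000000, DefaultYield = 7000, Zero = 0 };

int main() {
    TaskPriority minTaskID = TaskPriority::Max;
    minTaskID = std::min(minTaskID, TaskPriority::DefaultYield); // comparisons still work on enum class
    assert(minTaskID == TaskPriority::DefaultYield);

    std::int64_t priorityMetric = static_cast<std::int64_t>(minTaskID); // the metric needs an explicit cast
    assert(priorityMetric == 7000);
    // std::int64_t bad = minTaskID; // would not compile: no implicit conversion to an integer
}
```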
-void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, int64_t priority) { +void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, TaskPriority priority) { int64_t elapsed = tscEnd-tscBegin; if (elapsed > FLOW_KNOBS->TSC_YIELD_TIME && tscBegin > 0) { int i = std::min(NetworkMetrics::SLOW_EVENT_BINS-1, log( elapsed/1e6 ) / log(2.)); @@ -737,7 +737,7 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, i slowTaskMetric->clocks = elapsed; slowTaskMetric->duration = (int64_t)(duration*1e9); - slowTaskMetric->priority = priority; + slowTaskMetric->priority = static_cast(priority); slowTaskMetric->numYields = numYields; slowTaskMetric->log(); @@ -751,7 +751,7 @@ void Net2::checkForSlowTask(int64_t tscBegin, int64_t tscEnd, double duration, i } } -bool Net2::check_yield( int taskID, bool isRunLoop ) { +bool Net2::check_yield( TaskPriority taskID, bool isRunLoop ) { if(!isRunLoop && numYields > 0) { ++numYields; return true; @@ -764,8 +764,8 @@ bool Net2::check_yield( int taskID, bool isRunLoop ) { processThreadReady(); - if (taskID == TaskDefaultYield) taskID = currentTaskID; - if (!ready.empty() && ready.top().priority > (int64_t(taskID)<<32)) { + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; + if (!ready.empty() && ready.top().priority > int64_t(taskID)<<32) { return true; } @@ -790,13 +790,13 @@ bool Net2::check_yield( int taskID, bool isRunLoop ) { return false; } -bool Net2::check_yield( int taskID ) { +bool Net2::check_yield( TaskPriority taskID ) { return check_yield(taskID, false); } -Future Net2::yield( int taskID ) { +Future Net2::yield( TaskPriority taskID ) { ++countYieldCalls; - if (taskID == TaskDefaultYield) taskID = currentTaskID; + if (taskID == TaskPriority::DefaultYield) taskID = currentTaskID; if (check_yield(taskID, false)) { ++countYieldCallsTrue; return delay(0, taskID); @@ -805,7 +805,7 @@ Future Net2::yield( int taskID ) { return Void(); } -Future Net2::delay( double seconds, int taskId ) { +Future Net2::delay( double seconds, TaskPriority taskId ) { if (seconds <= 0.) { PromiseTask* t = new PromiseTask; this->ready.push( OrderedTask( (int64_t(taskId)<<32)-(++tasksIssued), taskId, t) ); @@ -820,7 +820,7 @@ Future Net2::delay( double seconds, int taskId ) { return t->promise.getFuture(); } -void Net2::onMainThread(Promise&& signal, int taskID) { +void Net2::onMainThread(Promise&& signal, TaskPriority taskID) { if (stopped) return; PromiseTask* p = new PromiseTask( std::move(signal) ); int64_t priority = int64_t(taskID)<<32; diff --git a/flow/ObjectSerializerTraits.h b/flow/ObjectSerializerTraits.h index 3301214e76..37b8b2ece3 100644 --- a/flow/ObjectSerializerTraits.h +++ b/flow/ObjectSerializerTraits.h @@ -62,42 +62,15 @@ struct index_impl<0, pack> { template using index_t = typename index_impl::type; -// A smart pointer that knows whether or not to delete itself. -template -using OwnershipErasedPtr = std::unique_ptr>; - -// Creates an OwnershipErasedPtr that will delete itself. -template > -OwnershipErasedPtr ownedPtr(T* t, Deleter&& d = Deleter{}) { - return OwnershipErasedPtr{ t, std::forward(d) }; -} - -// Creates an OwnershipErasedPtr that will not delete itself. 
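`Net2::delay` and `onMainThread` above still build the ready-queue ordering key as `(int64_t(taskID) << 32) - (++tasksIssued)`, only now from an enum value. A self-contained sketch of what that key encodes, with a plain `std::priority_queue` standing in for Net2's ready queue: the priority occupies the high bits and the issue counter breaks ties, giving priority-major, FIFO-minor ordering.

```
#include <cassert>
#include <cstdint>
#include <queue>
#include <utility>

// Same shape as Net2's key: priority in the high 32 bits, minus an issue counter.
std::int64_t orderingKey(std::int64_t priority, std::uint64_t& tasksIssued) {
    return (priority << 32) - std::int64_t(++tasksIssued);
}

int main() {
    std::uint64_t issued = 0;
    std::priority_queue<std::pair<std::int64_t, char>> ready; // max-heap, like Net2's ready queue
    ready.push({ orderingKey(7000, issued), 'a' });           // DefaultYield, issued first
    ready.push({ orderingKey(7000, issued), 'b' });           // DefaultYield, issued second
    ready.push({ orderingKey(10500, issued), 'c' });          // FlushTrace, issued last

    assert(ready.top().second == 'c'); ready.pop();           // higher priority always wins
    assert(ready.top().second == 'a'); ready.pop();           // FIFO within an equal priority
    assert(ready.top().second == 'b');
}
```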
-template -OwnershipErasedPtr unownedPtr(T* t) { - return OwnershipErasedPtr{ t, [](T*) {} }; -} - -struct WriteRawMemory { - using Block = std::pair, size_t>; - std::vector blocks; - - WriteRawMemory() {} - WriteRawMemory(Block&& b) { blocks.emplace_back(std::move(b.first), b.second); } - WriteRawMemory(std::vector&& v) : blocks(std::move(v)) {} - - WriteRawMemory(WriteRawMemory&&) = default; - WriteRawMemory& operator=(WriteRawMemory&&) = default; - - size_t size() const { - size_t result = 0; - for (const auto& b : blocks) { - result += b.second; - } - return result; - } +struct Block { + const uint8_t* data; + size_t size; }; +template +Block unownedPtr(T* t, size_t s) { + return Block{ t, s }; +} template struct scalar_traits : std::false_type { @@ -113,7 +86,8 @@ struct scalar_traits : std::false_type { template struct dynamic_size_traits : std::false_type { - static WriteRawMemory save(const T&); + static Block save(const T&); + static void serialization_done(const T&); // Optional. Called after the last call to save. // Context is an arbitrary type that is plumbed by reference throughout the // load call tree. @@ -140,7 +114,6 @@ struct vector_like_traits : std::false_type { static insert_iterator insert(VectorLike&); static iterator begin(const VectorLike&); - static void deserialization_done(VectorLike&); // Optional }; template diff --git a/flow/Profiler.actor.cpp b/flow/Profiler.actor.cpp index ef63f13c17..87befe9bb7 100644 --- a/flow/Profiler.actor.cpp +++ b/flow/Profiler.actor.cpp @@ -248,7 +248,7 @@ struct Profiler { outOffset += self->environmentInfoWriter.getLength(); loop { - wait( self->network->delay(1.0, TaskMinPriority) || self->network->delay(2.0, TaskMaxPriority) ); + wait( self->network->delay(1.0, TaskPriority::Min) || self->network->delay(2.0, TaskPriority::Max) ); self->enableSignal(false); std::swap( self->output_buffer, otherBuffer ); diff --git a/flow/ThreadHelper.actor.h b/flow/ThreadHelper.actor.h index 4fdd3c26ff..ed6a9cdc7d 100644 --- a/flow/ThreadHelper.actor.h +++ b/flow/ThreadHelper.actor.h @@ -35,11 +35,11 @@ // void onMainThreadVoid( F f ) { // Promise signal; // doOnMainThreadVoid( signal.getFuture(), f ); -// g_network->onMainThread( std::move(signal), TaskDefaultOnMainThread ); +// g_network->onMainThread( std::move(signal), TaskPriority::DefaultOnMainThread ); // } template -void onMainThreadVoid( F f, Error* err, int taskID = TaskDefaultOnMainThread ) { +void onMainThreadVoid( F f, Error* err, TaskPriority taskID = TaskPriority::DefaultOnMainThread ) { Promise signal; doOnMainThreadVoid( signal.getFuture(), f, err ); g_network->onMainThread( std::move(signal), taskID ); @@ -585,7 +585,7 @@ template ThreadFuture< decltype(fake()().getValue()) > onMainThread returnValue->addref(); // For the ThreadFuture we return Future cancelFuture = doOnMainThread()().getValue()), F>( signal.getFuture(), f, returnValue ); returnValue->setCancel( std::move(cancelFuture) ); - g_network->onMainThread( std::move(signal), TaskDefaultOnMainThread ); + g_network->onMainThread( std::move(signal), TaskPriority::DefaultOnMainThread ); return ThreadFuture()().getValue())>( returnValue ); } diff --git a/flow/Trace.cpp b/flow/Trace.cpp index 45fcce8d2e..4e70a5d29b 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -630,7 +630,7 @@ void openTraceFile(const NetworkAddress& na, uint64_t rollsize, uint64_t maxLogs std::string baseName = format("%s.%s.%d", baseOfBase.c_str(), ip.c_str(), na.port); g_traceLog.open( directory, baseName, logGroup, format("%lld", time(NULL)), 
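The ObjectSerializerTraits.h hunk above replaces the ownership-erased, multi-block `WriteRawMemory` with a single non-owning `Block` plus an optional `serialization_done` hook. A simplified sketch of the new trait contract, using `std::string` and free-standing functions rather than the real flatbuffers plumbing:

```
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>

struct Block { const std::uint8_t* data; std::size_t size; };

// Illustrative traits for std::string; the real specializations live in
// flow/Arena.h and flow/flat_buffers.h.
struct StringTraits {
    static Block save(const std::string& s) {
        return Block{ reinterpret_cast<const std::uint8_t*>(s.data()), s.size() };
    }
    static void serialization_done(const std::string&) {
        // Nothing to release for std::string; a type that pinned a buffer would free it here.
    }
    static void load(const std::uint8_t* p, std::size_t n, std::string& out) {
        out.assign(reinterpret_cast<const char*>(p), n);
    }
};

int main() {
    std::string in = "hello", out;
    Block b = StringTraits::save(in);        // the writer copies b.size bytes starting at b.data
    StringTraits::load(b.data, b.size, out); // the reader reconstructs the value
    StringTraits::serialization_done(in);    // called once, after the final (buffer-writing) pass
    assert(in == out);
}
```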
rollsize, maxLogsSize, !g_network->isSimulated() ? na : Optional()); - uncancellable(recurring(&flushTraceFile, FLOW_KNOBS->TRACE_FLUSH_INTERVAL, TaskFlushTrace)); + uncancellable(recurring(&flushTraceFile, FLOW_KNOBS->TRACE_FLUSH_INTERVAL, TaskPriority::FlushTrace)); g_traceBatch.dump(); } diff --git a/flow/flat_buffers.cpp b/flow/flat_buffers.cpp index d4c1ddcdf2..6c6c442e52 100644 --- a/flow/flat_buffers.cpp +++ b/flow/flat_buffers.cpp @@ -34,17 +34,16 @@ bool TraverseMessageTypes::vtableGeneratedBefore(const std::type_index& idx) { return !f.known_types.insert(idx).second; } -VTable generate_vtable(size_t numMembers, const std::vector& members, - const std::vector& alignments) { +VTable generate_vtable(size_t numMembers, const std::vector& sizesAlignments) { if (numMembers == 0) { return VTable{ 4, 4 }; } // first is index, second is size std::vector> indexed; - indexed.reserve(members.size()); - for (unsigned i = 0; i < members.size(); ++i) { - if (members[i] > 0) { - indexed.emplace_back(i, members[i]); + indexed.reserve(numMembers); + for (unsigned i = 0; i < numMembers; ++i) { + if (sizesAlignments[i] > 0) { + indexed.emplace_back(i, sizesAlignments[i]); } } std::stable_sort(indexed.begin(), indexed.end(), @@ -52,15 +51,15 @@ VTable generate_vtable(size_t numMembers, const std::vector& members, return lhs.second > rhs.second; }); VTable result; - result.resize(members.size() + 2); + result.resize(numMembers + 2); // size of the vtable is // - 2 bytes per member + // - 2 bytes for the size entry + // - 2 bytes for the size of the object - result[0] = 2 * members.size() + 4; + result[0] = 2 * numMembers + 4; int offset = 0; for (auto p : indexed) { - auto align = alignments[p.first]; + auto align = sizesAlignments[numMembers + p.first]; auto& res = result[p.first + 2]; res = offset % align == 0 ? offset : ((offset / align) + 1) * align; offset = res + p.second; @@ -78,8 +77,10 @@ TEST_CASE("flow/FlatBuffers/test") { auto* vtable1 = detail::get_vtable(); auto* vtable2 = detail::get_vtable(); auto* vtable3 = detail::get_vtable(); + auto* vtable4 = detail::get_vtable(); ASSERT(vtable1 != vtable2); ASSERT(vtable2 == vtable3); + ASSERT(vtable1 == vtable4); // Different types, but same vtable! Saves space in encoded messages ASSERT(vtable1->size() == 3); ASSERT(vtable2->size() == 7); ASSERT((*vtable2)[0] == 14); @@ -166,7 +167,6 @@ TEST_CASE("flow/FlatBuffers/collectVTables") { Root root; const auto* vtables = detail::get_vtableset(root); ASSERT(vtables == detail::get_vtableset(root)); - ASSERT(vtables->offsets.size() == 3); const auto& root_vtable = *detail::get_vtable, Nested>(); const auto& nested_vtable = *detail::get_vtable, int>(); int root_offset = vtables->offsets.at(&root_vtable); @@ -329,51 +329,10 @@ TEST_CASE("flow/FlatBuffers/vectorBool") { return Void(); } -struct DynamicSizeThingy { - std::string x; - mutable int saves = 0; -}; - } // namespace unit_tests -template <> -struct dynamic_size_traits : std::true_type { -private: - using T = unit_tests::DynamicSizeThingy; - -public: - static WriteRawMemory save(const T& t) { - ++t.saves; - T* t2 = new T(t); - return { { ownedPtr(reinterpret_cast(t2->x.data()), [t2](auto*) { delete t2; }), - t2->x.size() } }; - } - - // Context is an arbitrary type that is plumbed by reference throughout the - // load call tree. 
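`generate_vtable` above lays members out largest-first and rounds each offset up to the member's alignment via `((offset / align) + 1) * align`. A tiny worked example of just that rounding step, with illustrative member sizes and alignments (the surrounding vtable bookkeeping is omitted):

```
#include <cassert>
#include <cstddef>

// The per-member rounding step: bump offset up to the next multiple of align.
std::size_t alignUp(std::size_t offset, std::size_t align) {
    return offset % align == 0 ? offset : ((offset / align) + 1) * align;
}

int main() {
    // Illustrative members, already sorted largest-first as the stable_sort above does:
    // an 8-byte member (align 8), a 4-byte member (align 4), a 1-byte member (align 1).
    std::size_t offset = 0;
    std::size_t off8byte = alignUp(offset, 8); offset = off8byte + 8; // placed at 0
    std::size_t off4byte = alignUp(offset, 4); offset = off4byte + 4; // placed at 8
    std::size_t off1byte = alignUp(offset, 1); offset = off1byte + 1; // placed at 12
    assert(off8byte == 0 && off4byte == 8 && off1byte == 12 && offset == 13);
}
```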
- template - static void load(const uint8_t* p, size_t n, T& t, Context&) { - t.x.assign(reinterpret_cast(p), n); - } -}; - namespace unit_tests { -TEST_CASE("flow/FlatBuffers/dynamic_size_owned") { - DynamicSizeThingy x1 = { "abcdefg" }; - DynamicSizeThingy x2; - Arena arena; - DummyContext context; - const uint8_t* out; - - out = save_members(arena, FileIdentifier{}, x1); - ASSERT(x1.saves == 1); - // print_buffer(out, arena.get_size(out)); - load_members(out, context, x2); - ASSERT(x1.x == x2.x); - return Void(); -} - struct Y1 { int a; diff --git a/flow/flat_buffers.h b/flow/flat_buffers.h index a7ff261358..420fa9f83a 100644 --- a/flow/flat_buffers.h +++ b/flow/flat_buffers.h @@ -174,9 +174,7 @@ private: using T = std::string; public: - static WriteRawMemory save(const T& t) { - return { { unownedPtr(reinterpret_cast(t.data())), t.size() } }; - }; + static Block save(const T& t) { return unownedPtr(reinterpret_cast(t.data()), t.size()); }; // Context is an arbitrary type that is plumbed by reference throughout the // load call tree. @@ -233,13 +231,15 @@ template struct sfinae_true : std::true_type {}; template -auto test_deserialization_done(int) -> sfinae_true; +auto test_serialization_done(int) -> sfinae_true; template -auto test_deserialization_done(long) -> std::false_type; +auto test_serialization_done(long) -> std::false_type; +// int is a better match for 0 than long. If substituting T::serialization_done succeeds the true_type overload is +// selected. template -struct has_deserialization_done : decltype(test_deserialization_done(0)) {}; +struct has_serialization_done : decltype(test_serialization_done(0)) {}; template constexpr int fb_scalar_size = is_scalar ? scalar_traits::size : sizeof(RelativeOffset); @@ -324,19 +324,6 @@ struct PrecomputeSize { // offset. void write(const void*, int offset, int len) { current_buffer_size = std::max(current_buffer_size, offset); } - template - void writeRawMemory(ToRawMemory&& to_raw_memory) { - auto w = std::forward(to_raw_memory)(); - int start = RightAlign(current_buffer_size + w.size() + 4, 4); - write(nullptr, start, 4); - start -= 4; - for (auto& block : w.blocks) { - write(nullptr, start, block.second); - start -= block.second; - } - writeRawMemories.emplace_back(std::move(w)); - } - struct Noop { void write(const void* src, int offset, int len) {} void writeTo(PrecomputeSize& writer, int offset) { @@ -355,12 +342,13 @@ struct PrecomputeSize { return Noop{ size, writeToIndex }; } + static constexpr bool finalPass = false; + int current_buffer_size = 0; const int buffer_length = -1; // Dummy, the value of this should not affect anything. const int vtable_start = -1; // Dummy, the value of this should not affect anything. 
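The renamed detection trait above (`has_serialization_done`) relies on the idiom described in the new comment: passing `0` prefers the `int` overload, and that overload only survives substitution when `T::serialization_done` exists. A self-contained sketch of the same idiom, with two toy types standing in for real serializable types:

```
#include <type_traits>

template <class>
struct sfinae_true : std::true_type {};

template <class T>
auto test_serialization_done(int) -> sfinae_true<decltype(T::serialization_done)>;
template <class T>
auto test_serialization_done(long) -> std::false_type;

// int is a better match for 0 than long, so the true_type overload wins when it compiles.
template <class T>
struct has_serialization_done : decltype(test_serialization_done<T>(0)) {};

struct WithHook    { static void serialization_done(); };
struct WithoutHook {};

static_assert(has_serialization_done<WithHook>::value, "hook detected");
static_assert(!has_serialization_done<WithoutHook>::value, "no hook");

int main() {}
```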
std::vector writeToOffsets; - std::vector writeRawMemories; }; template @@ -382,26 +370,9 @@ struct WriteToBuffer { current_buffer_size = std::max(current_buffer_size, offset); } - template - void writeRawMemory(ToRawMemory&&) { - auto& w = *write_raw_memories_iter; - uint32_t size = w.size(); - int start = RightAlign(current_buffer_size + size + 4, 4); - write(&size, start, 4); - start -= 4; - for (auto& p : w.blocks) { - if (p.second > 0) { - write(reinterpret_cast(p.first.get()), start, p.second); - } - start -= p.second; - } - ++write_raw_memories_iter; - } - - WriteToBuffer(int buffer_length, int vtable_start, uint8_t* buffer, std::vector writeToOffsets, - std::vector::iterator write_raw_memories_iter) + WriteToBuffer(int buffer_length, int vtable_start, uint8_t* buffer, std::vector writeToOffsets) : buffer_length(buffer_length), vtable_start(vtable_start), buffer(buffer), - writeToOffsets(std::move(writeToOffsets)), write_raw_memories_iter(write_raw_memories_iter) {} + writeToOffsets(std::move(writeToOffsets)) {} struct MessageWriter { template @@ -433,12 +404,13 @@ struct WriteToBuffer { const int vtable_start; int current_buffer_size = 0; + static constexpr bool finalPass = true; + private: void copy_memory(const void* src, int offset, int len) { memcpy(static_cast(&buffer[buffer_length - offset]), src, len); } std::vector writeToOffsets; - std::vector::iterator write_raw_memories_iter; int writeToIndex = 0; uint8_t* buffer; }; @@ -459,24 +431,28 @@ constexpr auto fields_helper() { template using Fields = decltype(fields_helper()); -// TODO(anoyes): Make this `template ` so we can re-use -// identical vtables even if they have different types. -// Also, it's important that get_vtable always returns the same VTable pointer +// It's important that get_vtable always returns the same VTable pointer // so that we can decide equality by comparing the pointers. -extern VTable generate_vtable(size_t numMembers, const std::vector& members, - const std::vector& alignments); +// First |numMembers| elements of sizesAndAlignments are sizes, the second +// |numMembers| elements are alignments. +extern VTable generate_vtable(size_t numMembers, const std::vector& sizesAndAlignments); + +template +const VTable* gen_vtable3() { + static VTable table = + generate_vtable(sizeof...(MembersAndAlignments) / 2, std::vector{ MembersAndAlignments... }); + return &table; +} template -VTable gen_vtable(pack p) { - return generate_vtable(sizeof...(Members), std::vector{ { _SizeOf::size... } }, - std::vector{ { _SizeOf::align... 
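With `writeRawMemory` gone, the two writer passes are distinguished by the new `finalPass` constant: `PrecomputeSize` reports `false`, `WriteToBuffer` reports `true`, and (as shown later in flat_buffers.h) the `serialization_done` hook fires only on the final pass. A stripped-down sketch of that two-pass shape, with a counter standing in for the hook:

```
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

struct PrecomputeSize {
    static constexpr bool finalPass = false;
    std::size_t size = 0;
    void write(const void*, std::size_t len) { size += len; } // sizing pass: only measures
};

struct WriteToBuffer {
    static constexpr bool finalPass = true;
    std::vector<char> out;
    void write(const void* src, std::size_t len) {            // final pass: actually copies
        const char* p = static_cast<const char*>(src);
        out.insert(out.end(), p, p + len);
    }
};

int doneCalls = 0; // stands in for dynamic_size_traits<T>::serialization_done

template <class Writer>
void saveString(Writer& w, const std::string& s) {
    w.write(s.data(), s.size());
    if constexpr (Writer::finalPass) ++doneCalls; // the hook fires on the final pass only
}

int main() {
    std::string msg = "abc";
    PrecomputeSize sizer;
    saveString(sizer, msg);

    WriteToBuffer writer;
    writer.out.reserve(sizer.size);
    saveString(writer, msg);

    assert(sizer.size == 3 && writer.out.size() == 3 && doneCalls == 1);
}
```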
} }); +const VTable* gen_vtable2(pack p) { + return gen_vtable3<_SizeOf::size..., _SizeOf::align...>(); } template const VTable* get_vtable() { - static VTable table = gen_vtable(concat_t...>{}); - return &table; + return gen_vtable2(concat_t...>{}); } template @@ -542,6 +518,7 @@ private: struct InsertVTableLambda { static constexpr bool isDeserializing = false; + static constexpr bool isSerializing = false; static constexpr bool is_fb_visitor = true; std::set& vtables; std::set& known_types; @@ -665,6 +642,7 @@ private: template struct SaveVisitorLambda { static constexpr bool isDeserializing = false; + static constexpr bool isSerializing = true; static constexpr bool is_fb_visitor = true; const VTableSet* vtableset; Writer& writer; @@ -738,6 +716,7 @@ struct SaveVisitorLambda { template struct LoadMember { static constexpr bool isDeserializing = true; + static constexpr bool isSerializing = false; const uint16_t* const vtable; const uint8_t* const message; const uint16_t vtable_length; @@ -774,9 +753,6 @@ struct LoadMember { ++inserter; current += sizeof(RelativeOffset); } - if constexpr (has_deserialization_done::value) { - VectorTraits::deserialization_done(member); - } } else if constexpr (is_union_like) { if (!field_present()) { i += 2; @@ -852,6 +828,7 @@ struct LoadSaveHelper { template struct SerializeFun { static constexpr bool isDeserializing = true; + static constexpr bool isSerializing = false; static constexpr bool is_fb_visitor = true; const uint16_t* vtable; @@ -901,9 +878,6 @@ struct LoadSaveHelper { ++inserter; current += fb_size; } - if constexpr (has_deserialization_done::value) { - VectorTraits::deserialization_done(member); - } } template >> @@ -934,7 +908,15 @@ struct LoadSaveHelper { template >> RelativeOffset save(const U& message, Writer& writer, const VTableSet*, std::enable_if_t, int> _ = 0) { - writer.writeRawMemory([&]() { return dynamic_size_traits::save(message); }); + auto block = dynamic_size_traits::save(message); + uint32_t size = block.size; + int start = RightAlign(writer.current_buffer_size + size + 4, 4); + writer.write(&size, start, 4); + start -= 4; + writer.write(block.data, start, block.size); + if constexpr (has_serialization_done>::value && Writer::finalPass) { + dynamic_size_traits::serialization_done(message); + } return RelativeOffset{ writer.current_buffer_size }; } @@ -1050,7 +1032,7 @@ uint8_t* save(Allocator& allocator, const Root& root, FileIdentifier file_identi uint8_t* out = allocator(precompute_size.current_buffer_size); memset(out, 0, precompute_size.current_buffer_size); WriteToBuffer writeToBuffer{ precompute_size.current_buffer_size, vtable_start, out, - std::move(precompute_size.writeToOffsets), precompute_size.writeRawMemories.begin() }; + std::move(precompute_size.writeToOffsets) }; save_with_vtables(root, vtableset, writeToBuffer, &vtable_start, file_identifier); return out; } diff --git a/flow/flow.h b/flow/flow.h index 7ce23eade7..53f35516eb 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -817,7 +817,7 @@ public: return getReplyPromise(value).getFuture(); } template - Future getReply(const X& value, int taskID) const { + Future getReply(const X& value, TaskPriority taskID) const { setReplyPriority(value, taskID); return getReplyPromise(value).getFuture(); } @@ -827,7 +827,7 @@ public: return getReply(Promise()); } template - Future getReplyWithTaskID(int taskID) const { + Future getReplyWithTaskID(TaskPriority taskID) const { Promise reply; reply.getEndpoint(taskID); return getReply(reply); @@ -908,11 +908,11 @@ struct 
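`gen_vtable3` above keys the static vtable on a non-type template pack of sizes and alignments, which is what lets structurally identical types share one table and makes pointer comparison a valid equality test (the `vtable1 == vtable4` assertion earlier in this patch). A toy version of the same trick, assuming two hypothetical structs whose members have sizes 4 and 8:

```
#include <cassert>
#include <vector>

using VTable = std::vector<unsigned>;

// One static table per unique pack of sizes-then-alignments, mirroring gen_vtable3.
template <unsigned... SizesAndAlignments>
const VTable* get_toy_vtable() {
    static VTable table{ SizesAndAlignments... };
    return &table;
}

int main() {
    // Hypothetical struct A { int32_t x; int64_t y; } and struct B { uint32_t a; double b; }:
    // different types, identical member sizes (4, 8) and alignments (4, 8)...
    const VTable* a = get_toy_vtable<4, 8, 4, 8>();
    const VTable* b = get_toy_vtable<4, 8, 4, 8>();
    const VTable* c = get_toy_vtable<8, 8, 8, 8>(); // a different layout gets its own table
    assert(a == b); // ...so they share one table, like the vtable1 == vtable4 assertion above
    assert(a != c);
}
```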
ActorSingleCallback : SingleCallback { } }; inline double now() { return g_network->now(); } -inline Future delay(double seconds, int taskID = TaskDefaultDelay) { return g_network->delay(seconds, taskID); } -inline Future delayUntil(double time, int taskID = TaskDefaultDelay) { return g_network->delay(std::max(0.0, time - g_network->now()), taskID); } -inline Future delayJittered(double seconds, int taskID = TaskDefaultDelay) { return g_network->delay(seconds*(FLOW_KNOBS->DELAY_JITTER_OFFSET + FLOW_KNOBS->DELAY_JITTER_RANGE*deterministicRandom()->random01()), taskID); } -inline Future yield(int taskID = TaskDefaultYield) { return g_network->yield(taskID); } -inline bool check_yield(int taskID = TaskDefaultYield) { return g_network->check_yield(taskID); } +inline Future delay(double seconds, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(seconds, taskID); } +inline Future delayUntil(double time, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(std::max(0.0, time - g_network->now()), taskID); } +inline Future delayJittered(double seconds, TaskPriority taskID = TaskPriority::DefaultDelay) { return g_network->delay(seconds*(FLOW_KNOBS->DELAY_JITTER_OFFSET + FLOW_KNOBS->DELAY_JITTER_RANGE*deterministicRandom()->random01()), taskID); } +inline Future yield(TaskPriority taskID = TaskPriority::DefaultYield) { return g_network->yield(taskID); } +inline bool check_yield(TaskPriority taskID = TaskPriority::DefaultYield) { return g_network->check_yield(taskID); } #include "flow/genericactors.actor.h" #endif diff --git a/flow/genericactors.actor.h b/flow/genericactors.actor.h index 7b577b2e4c..fdf02a30d2 100644 --- a/flow/genericactors.actor.h +++ b/flow/genericactors.actor.h @@ -183,7 +183,7 @@ Future waitForAllReady( std::vector> results ) { } ACTOR template -Future timeout( Future what, double time, T timedoutValue, int taskID = TaskDefaultDelay ) { +Future timeout( Future what, double time, T timedoutValue, TaskPriority taskID = TaskPriority::DefaultDelay ) { Future end = delay( time, taskID ); choose { when( T t = wait( what ) ) { return t; } @@ -201,7 +201,7 @@ Future> timeout( Future what, double time ) { } ACTOR template -Future timeoutError( Future what, double time, int taskID = TaskDefaultDelay ) { +Future timeoutError( Future what, double time, TaskPriority taskID = TaskPriority::DefaultDelay ) { Future end = delay( time, taskID ); choose { when( T t = wait( what ) ) { return t; } @@ -210,7 +210,7 @@ Future timeoutError( Future what, double time, int taskID = TaskDefaultDel } ACTOR template -Future delayed( Future what, double time = 0.0, int taskID = TaskDefaultDelay ) { +Future delayed( Future what, double time = 0.0, TaskPriority taskID = TaskPriority::DefaultDelay ) { try { state T t = wait( what ); wait( delay( time, taskID ) ); @@ -223,7 +223,7 @@ Future delayed( Future what, double time = 0.0, int taskID = TaskDefaultDe } ACTOR template -Future recurring( Func what, double interval, int taskID = TaskDefaultDelay ) { +Future recurring( Func what, double interval, TaskPriority taskID = TaskPriority::DefaultDelay ) { loop choose { when ( wait( delay( interval, taskID ) ) ) { what(); } } @@ -951,7 +951,7 @@ Future quorum(std::vector> const& results, int n) { } ACTOR template -Future smartQuorum( std::vector> results, int required, double extraSeconds, int taskID = TaskDefaultDelay ) { +Future smartQuorum( std::vector> results, int required, double extraSeconds, TaskPriority taskID = TaskPriority::DefaultDelay ) { if (results.empty() 
&& required == 0) return Void(); wait(quorum(results, required)); choose { @@ -1259,7 +1259,7 @@ struct FlowLock : NonCopyable, public ReferenceCounted { FlowLock() : permits(1), active(0) {} explicit FlowLock(int64_t permits) : permits(permits), active(0) {} - Future take(int taskID = TaskDefaultYield, int64_t amount = 1) { + Future take(TaskPriority taskID = TaskPriority::DefaultYield, int64_t amount = 1) { if (active + amount <= permits || active == 0) { active += amount; return safeYieldActor(this, taskID, amount); @@ -1298,7 +1298,7 @@ private: int64_t active; Promise broken_on_destruct; - ACTOR static Future takeActor(FlowLock* lock, int taskID, int64_t amount) { + ACTOR static Future takeActor(FlowLock* lock, TaskPriority taskID, int64_t amount) { state std::list, int64_t>>::iterator it = lock->takers.insert(lock->takers.end(), std::make_pair(Promise(), amount)); try { @@ -1330,7 +1330,7 @@ private: return Void(); } - ACTOR static Future safeYieldActor(FlowLock* lock, int taskID, int64_t amount) { + ACTOR static Future safeYieldActor(FlowLock* lock, TaskPriority taskID, int64_t amount) { try { choose{ when(wait(yield(taskID))) {} @@ -1351,7 +1351,7 @@ private: }; ACTOR template -Future yieldPromiseStream( FutureStream input, PromiseStream output, int taskID = TaskDefaultYield ) { +Future yieldPromiseStream( FutureStream input, PromiseStream output, TaskPriority taskID = TaskPriority::DefaultYield ) { loop { T f = waitNext( input ); output.send( f ); diff --git a/flow/network.h b/flow/network.h index 02532ba6ea..494d238a2e 100644 --- a/flow/network.h +++ b/flow/network.h @@ -29,57 +29,71 @@ #include "boost/asio.hpp" #include "flow/serialize.h" #include "flow/IRandom.h" -#include "fdbrpc/crc32c.h" -enum { - TaskMaxPriority = 1000000, - TaskRunCycleFunction = 20000, - TaskFlushTrace = 10500, - TaskWriteSocket = 10000, - TaskPollEIO = 9900, - TaskDiskIOComplete = 9150, - TaskLoadBalancedEndpoint = 9000, - TaskReadSocket = 9000, - TaskCoordinationReply = 8810, - TaskCoordination = 8800, - TaskFailureMonitor = 8700, - TaskResolutionMetrics = 8700, - TaskClusterController = 8650, - TaskProxyStorageRejoin = 8645, - TaskProxyCommitDispatcher = 8640, - TaskTLogQueuingMetrics = 8620, - TaskTLogPop = 8610, - TaskTLogPeekReply = 8600, - TaskTLogPeek = 8590, - TaskTLogCommitReply = 8580, - TaskTLogCommit = 8570, - TaskProxyGetRawCommittedVersion = 8565, - TaskProxyResolverReply = 8560, - TaskProxyCommitBatcher = 8550, - TaskProxyCommit = 8540, - TaskTLogConfirmRunningReply = 8530, - TaskTLogConfirmRunning = 8520, - TaskProxyGRVTimer = 8510, - TaskProxyGetConsistentReadVersion = 8500, - TaskDefaultPromiseEndpoint = 8000, - TaskDefaultOnMainThread = 7500, - TaskDefaultDelay = 7010, - TaskDefaultYield = 7000, - TaskDiskWrite = 5030, - TaskStorage = 5020, - TaskDiskRead = 5010, - TaskDefaultEndpoint = 5000, - TaskUnknownEndpoint = 4000, - TaskMoveKeys = 3550, - TaskDataDistributionLaunch = 3530, - TaskRatekeeper = 3510, - TaskDataDistribution = 3500, - TaskUpdateStorage = 3000, - TaskTLogSpilledPeekReply = 2800, - TaskLowPriority = 2000, - TaskMinPriority = 1000 +enum class TaskPriority { + Max = 1000000, + RunCycleFunction = 20000, + FlushTrace = 10500, + WriteSocket = 10000, + PollEIO = 9900, + DiskIOComplete = 9150, + LoadBalancedEndpoint = 9000, + ReadSocket = 9000, + CoordinationReply = 8810, + Coordination = 8800, + FailureMonitor = 8700, + ResolutionMetrics = 8700, + ClusterController = 8650, + ProxyStorageRejoin = 8645, + ProxyCommitDispatcher = 8640, + TLogQueuingMetrics = 8620, + TLogPop = 
8610, + TLogPeekReply = 8600, + TLogPeek = 8590, + TLogCommitReply = 8580, + TLogCommit = 8570, + ProxyGetRawCommittedVersion = 8565, + ProxyResolverReply = 8560, + ProxyCommitBatcher = 8550, + ProxyCommit = 8540, + TLogConfirmRunningReply = 8530, + TLogConfirmRunning = 8520, + ProxyGRVTimer = 8510, + ProxyGetConsistentReadVersion = 8500, + DefaultPromiseEndpoint = 8000, + DefaultOnMainThread = 7500, + DefaultDelay = 7010, + DefaultYield = 7000, + DiskRead = 5010, + DefaultEndpoint = 5000, + UnknownEndpoint = 4000, + MoveKeys = 3550, + DataDistributionLaunch = 3530, + Ratekeeper = 3510, + DataDistribution = 3500, + DiskWrite = 3010, + UpdateStorage = 3000, + TLogSpilledPeekReply = 2800, + Low = 2000, + + Min = 1000, + Zero = 0 }; +// These have been given long, annoying names to discourage their use. + +inline TaskPriority incrementPriority(TaskPriority p) { + return static_cast( static_cast(p) + 1 ); +} + +inline TaskPriority decrementPriority(TaskPriority p) { + return static_cast( static_cast(p) - 1 ); +} + +inline TaskPriority incrementPriorityIfEven(TaskPriority p) { + return static_cast( static_cast(p) | 1 ); +} + class Void; template class Optional; @@ -270,7 +284,7 @@ struct NetworkMetrics { uint64_t countSlowEvents[SLOW_EVENT_BINS]; enum { PRIORITY_BINS = 9 }; - int priorityBins[ PRIORITY_BINS ]; + TaskPriority priorityBins[ PRIORITY_BINS ]; double secSquaredPriorityBlocked[PRIORITY_BINS]; double oldestAlternativesFailure; @@ -372,19 +386,19 @@ public: // Provides a clock that advances at a similar rate on all connected endpoints // FIXME: Return a fixed point Time class - virtual Future delay( double seconds, int taskID ) = 0; + virtual Future delay( double seconds, TaskPriority taskID ) = 0; // The given future will be set after seconds have elapsed - virtual Future yield( int taskID ) = 0; + virtual Future yield( TaskPriority taskID ) = 0; // The given future will be set immediately or after higher-priority tasks have executed - virtual bool check_yield( int taskID ) = 0; + virtual bool check_yield( TaskPriority taskID ) = 0; // Returns true if a call to yield would result in a delay - virtual int getCurrentTask() = 0; + virtual TaskPriority getCurrentTask() = 0; // Gets the taskID/priority of the current task - virtual void setCurrentTask(int taskID ) = 0; + virtual void setCurrentTask(TaskPriority taskID ) = 0; // Sets the taskID/priority of the current task, without yielding virtual flowGlobalType global(int id) = 0; @@ -399,7 +413,7 @@ public: virtual bool isOnMainThread() const = 0; // Returns true if the current thread is the main thread - virtual void onMainThread( Promise&& signal, int taskID ) = 0; + virtual void onMainThread( Promise&& signal, TaskPriority taskID ) = 0; // Executes signal.send(Void()) on a/the thread belonging to this network virtual THREAD_HANDLE startThread( THREAD_FUNC_RETURN (*func) (void *), void *arg) = 0; diff --git a/flow/serialize.h b/flow/serialize.h index e7431e7205..1a3a916549 100644 --- a/flow/serialize.h +++ b/flow/serialize.h @@ -317,6 +317,7 @@ inline _Unversioned Unversioned() { return _Unversioned(); } class BinaryWriter : NonCopyable { public: static const int isDeserializing = 0; + static constexpr bool isSerializing = true; typedef BinaryWriter WRITER; void serializeBytes( StringRef bytes ) { @@ -518,6 +519,7 @@ private: class ArenaReader { public: static const int isDeserializing = 1; + static constexpr bool isSerializing = false; typedef ArenaReader READER; const void* readBytes( int bytes ) { @@ -583,6 +585,7 @@ private: class 
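The flow/network.h hunk above also adds `incrementPriority` and `decrementPriority`, deliberately verbose replacements for the `taskID + 1` / `taskID - 1` arithmetic that no longer compiles on the scoped enum. A short sketch with a single illustrative enumerator and bodies modelled on those helpers:

```
#include <cassert>

enum class TaskPriority { DefaultEndpoint = 5000 };

// Modelled on the helpers declared above.
inline TaskPriority incrementPriority(TaskPriority p) {
    return static_cast<TaskPriority>(static_cast<int>(p) + 1);
}
inline TaskPriority decrementPriority(TaskPriority p) {
    return static_cast<TaskPriority>(static_cast<int>(p) - 1);
}

int main() {
    TaskPriority justAbove = incrementPriority(TaskPriority::DefaultEndpoint);
    TaskPriority justBelow = decrementPriority(TaskPriority::DefaultEndpoint);
    assert(static_cast<int>(justAbove) == 5001 && static_cast<int>(justBelow) == 4999);
    // TaskPriority::DefaultEndpoint + 1; // would not compile on the scoped enum
}
```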
BinaryReader { public: static const int isDeserializing = 1; + static constexpr bool isSerializing = false; typedef BinaryReader READER; const void* readBytes( int bytes ); @@ -682,6 +685,7 @@ struct PacketBuffer : SendBuffer, FastAllocated { struct PacketWriter { static const int isDeserializing = 0; + static constexpr bool isSerializing = true; typedef PacketWriter WRITER; PacketBuffer* buffer; diff --git a/packaging/docker/Dockerfile b/packaging/docker/Dockerfile index 101ba295ab..dc514870f3 100644 --- a/packaging/docker/Dockerfile +++ b/packaging/docker/Dockerfile @@ -70,5 +70,6 @@ ENV FDB_PORT 4500 ENV FDB_CLUSTER_FILE /var/fdb/fdb.cluster ENV FDB_NETWORKING_MODE container ENV FDB_COORDINATOR "" +ENV FDB_COORDINATOR_PORT 4500 ENV FDB_CLUSTER_FILE_CONTENTS "" ENV FDB_PROCESS_CLASS unset diff --git a/packaging/docker/README.md b/packaging/docker/README.md index a8d6f48de8..39fc94844a 100644 --- a/packaging/docker/README.md +++ b/packaging/docker/README.md @@ -57,6 +57,13 @@ helpful when setting up a larger cluster inside a docker network, for instance when using Docker Compose. The name you provide must be resolvable through the DNS on the container you are running. +### FDB_COORDINATOR_PORT + +The port to use for connecting to the FDB coordinator process. This should be +set by other processes in a multi-process cluster to the same value as the +`FDB_PORT` environment variable of the coordinator process. It will default +to 4500, which is also the default for `FDB_PORT`. + # Copying Into Other Images You can also use this image to provide files for images that are clients of a @@ -68,4 +75,4 @@ files you may want to copy are: library, which you can use if you are setting up a multiversion client. * `/var/fdb/scripts/create_cluster_file.bash`: A script for setting up the cluster file based on an `FDB_COORDINATOR` environment variable. -* `/usr/bin/fdbcli`: The FoundationDB CLI. \ No newline at end of file +* `/usr/bin/fdbcli`: The FoundationDB CLI. 
diff --git a/packaging/docker/create_cluster_file.bash b/packaging/docker/create_cluster_file.bash index b701b03d1a..c1bb959b8e 100644 --- a/packaging/docker/create_cluster_file.bash +++ b/packaging/docker/create_cluster_file.bash @@ -39,7 +39,8 @@ function create_cluster_file() { echo "Failed to look up coordinator address for $FDB_COORDINATOR" 1>&2 exit 1 fi - echo "docker:docker@$coordinator_ip:4500" > $FDB_CLUSTER_FILE + coordinator_port=${FDB_COORDINATOR_PORT:-4500} + echo "docker:docker@$coordinator_ip:$coordinator_port" > $FDB_CLUSTER_FILE else echo "FDB_COORDINATOR environment variable not defined" 1>&2 exit 1 @@ -47,5 +48,5 @@ function create_cluster_file() { } if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - create_cluster_file "$@" -fi \ No newline at end of file + create_cluster_file "$@" +fi diff --git a/packaging/docker/create_server_environment.bash b/packaging/docker/create_server_environment.bash index 67979839b9..54d90f0854 100644 --- a/packaging/docker/create_server_environment.bash +++ b/packaging/docker/create_server_environment.bash @@ -43,4 +43,4 @@ function create_server_environment() { fi create_cluster_file -} \ No newline at end of file +} diff --git a/packaging/docker/fdb.bash b/packaging/docker/fdb.bash index 3fb322c431..3bf1c6a680 100644 --- a/packaging/docker/fdb.bash +++ b/packaging/docker/fdb.bash @@ -23,7 +23,7 @@ source /var/fdb/scripts/create_server_environment.bash create_server_environment source /var/fdb/.fdbenv -echo "Starting FDB server on $PUBLIC_IP:4500" -fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:4500 \ +echo "Starting FDB server on $PUBLIC_IP:$FDB_PORT" +fdbserver --listen_address 0.0.0.0:$FDB_PORT --public_address $PUBLIC_IP:$FDB_PORT \ --datadir /var/fdb/data --logdir /var/fdb/logs \ - --locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS \ No newline at end of file + --locality_zoneid=`hostname` --locality_machineid=`hostname` --class $FDB_PROCESS_CLASS diff --git a/packaging/docker/samples/local/README.md b/packaging/docker/samples/local/README.md new file mode 100644 index 0000000000..f7f5b3e979 --- /dev/null +++ b/packaging/docker/samples/local/README.md @@ -0,0 +1,45 @@ +# Local Docker-based FoundationDB Cluster + +This contains a sample `docker-compose.yaml` and some simple startup and teardown +scripts for running a simple single-instance FoundationDB using the Docker image +specified in this repository. This uses the `host` networking option to expose +the server process to its host machine. + +This depends on having the FoundationDB client installed on your host machine +to work properly. This can be done using one of the client packages available +on our [Download](https://www.foundationdb.org/download/) page. The startup +scripts included here depend on `fdbcli` from one of those packages, and any +client that wishes to connect will need a copy of the FoundationDB native client +in addition to its binding of choice. Both the CLI and the native client +are installed in all of our client packages + +Once those dependencies are installed, one can build the FoundationDB Docker +image: + +``` +docker build --build-arg FDB_VERSION=6.1.8 -t foundationdb:6.1.8 ../.. +``` + +Then one can start the cluster by running: + +``` +./start.bash +``` + +This starts up a single instance FoundationDB cluster using the `docker-compose.yaml` +and configures it as a new database. This will write the cluster file information to +`docker.cluster`. 
One should then be able to access the cluster through the CLI +or one of the bindings by using this cluster file. For example: + +``` +fdbcli --exec status -C docker.cluster +``` + +To stop the cluster, one can run: + +``` +./stop.bash +``` + +Note that all data are lost between reboots of the processes as they have not +been configured to use a persistent volume (but write to Docker's temporary file system). diff --git a/packaging/docker/samples/local/docker-compose.yml b/packaging/docker/samples/local/docker-compose.yml new file mode 100644 index 0000000000..3ce177afb5 --- /dev/null +++ b/packaging/docker/samples/local/docker-compose.yml @@ -0,0 +1,32 @@ +# docker-compose.yaml +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Specification for a one node cluster than can be accessed from the host. +# The user must specify the FDB_PORT on which it is run. + +version: '3' +services: + fdb: + image: foundationdb:6.1.8 + ports: + - $FDB_PORT:$FDB_PORT/tcp + environment: + FDB_NETWORKING_MODE: host + FDB_COORDINATOR_PORT: $FDB_PORT + FDB_PORT: $FDB_PORT diff --git a/packaging/docker/samples/local/start.bash b/packaging/docker/samples/local/start.bash new file mode 100755 index 0000000000..64def42f51 --- /dev/null +++ b/packaging/docker/samples/local/start.bash @@ -0,0 +1,39 @@ +#! /bin/bash + +# +# start.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -eu + +FDB_CLUSTER_FILE="${FDB_CLUSTER_FILE:-docker.cluster}" +FDB_PORT="${FDB_PORT:-4550}" + +FDB_PORT=$FDB_PORT docker-compose up -d fdb +echo "docker:docker@127.0.0.1:$FDB_PORT" > $FDB_CLUSTER_FILE + +# Attempt to connect. Configure the database if necessary. +if ! fdbcli -C $FDB_CLUSTER_FILE --exec status --timeout 1 ; then + if ! fdbcli -C $FDB_CLUSTER_FILE --exec "configure new single memory ; status" --timeout 10 ; then + echo "Unable to configure new FDB cluster." + exit 1 + fi +fi + +echo "Can now connect to docker-based FDB cluster using $FDB_CLUSTER_FILE." diff --git a/packaging/docker/samples/local/stop.bash b/packaging/docker/samples/local/stop.bash new file mode 100755 index 0000000000..55acc50953 --- /dev/null +++ b/packaging/docker/samples/local/stop.bash @@ -0,0 +1,28 @@ +#! 
/bin/bash + +# +# stop.bash +# +# This source file is part of the FoundationDB open source project +# +# Copyright 2013-2018 Apple Inc. and the FoundationDB project authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -eu + +FDB_PORT="${FDB_PORT:-4550}" + +FDB_PORT=$FDB_PORT docker-compose down +echo "Docker-based FDB cluster is now down." diff --git a/packaging/docker/samples/python/app/Dockerfile b/packaging/docker/samples/python/app/Dockerfile index 8172f5aaea..7a3ed818a2 100644 --- a/packaging/docker/samples/python/app/Dockerfile +++ b/packaging/docker/samples/python/app/Dockerfile @@ -24,9 +24,9 @@ RUN apt-get update; apt-get install -y dnsutils RUN mkdir -p /app WORKDIR /app -COPY --from=foundationdb:5.2.5 /usr/lib/libfdb_c.so /usr/lib -COPY --from=foundationdb:5.2.5 /usr/bin/fdbcli /usr/bin/ -COPY --from=foundationdb:5.2.5 /var/fdb/scripts/create_cluster_file.bash /app +COPY --from=foundationdb:6.1.8 /usr/lib/libfdb_c.so /usr/lib +COPY --from=foundationdb:6.1.8 /usr/bin/fdbcli /usr/bin/ +COPY --from=foundationdb:6.1.8 /var/fdb/scripts/create_cluster_file.bash /app COPY requirements.txt /app RUN pip install -r requirements.txt @@ -38,4 +38,4 @@ RUN chmod u+x /app/start.bash CMD /app/start.bash ENV FLASK_APP=server.py -ENV FLASK_ENV=development \ No newline at end of file +ENV FLASK_ENV=development diff --git a/packaging/docker/samples/python/docker-compose.yml b/packaging/docker/samples/python/docker-compose.yml index 2280414688..e239bff80f 100644 --- a/packaging/docker/samples/python/docker-compose.yml +++ b/packaging/docker/samples/python/docker-compose.yml @@ -19,18 +19,33 @@ version: '3' services: - fdb: - image: foundationdb:5.2.5 - environment: - FDB_COORDINATOR: fdb-coordinator + # Specify three fdbserver processes. fdb-coordinator: - image: foundationdb:5.2.5 + image: foundationdb:6.1.8 environment: FDB_COORDINATOR: fdb-coordinator + fdb-server-1: + depends_on: + - fdb-coordinator + image: foundationdb:6.1.8 + environment: + FDB_COORDINATOR: fdb-coordinator + fdb-server-2: + depends_on: + - fdb-coordinator + image: foundationdb:6.1.8 + environment: + FDB_COORDINATOR: fdb-coordinator + + # Bring up the application so that it depends on the cluster. app: + depends_on: + - fdb-coordinator + - fdb-server-1 + - fdb-server-2 build: context: app ports: - - 5000:5000 + - 5000:5000/tcp environment: - FDB_COORDINATOR: fdb-coordinator \ No newline at end of file + FDB_COORDINATOR: fdb-coordinator diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index d81feaf3d7..486cd85ea0 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@