From 8dd08eef22829a2148cc7037c69f96bca322852f Mon Sep 17 00:00:00 2001 From: negoyal Date: Wed, 4 Mar 2020 16:51:05 -0800 Subject: [PATCH 01/27] Enable debugTransaction for all transactions. --- fdbclient/ReadYourWrites.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbclient/ReadYourWrites.actor.cpp b/fdbclient/ReadYourWrites.actor.cpp index 81f6f0e172..c27a9e50bc 100644 --- a/fdbclient/ReadYourWrites.actor.cpp +++ b/fdbclient/ReadYourWrites.actor.cpp @@ -1129,6 +1129,7 @@ ReadYourWritesTransaction::ReadYourWritesTransaction(Database const& cx) options(tr), deferredError(cx->deferredError) { std::copy(cx.getTransactionDefaults().begin(), cx.getTransactionDefaults().end(), std::back_inserter(persistentOptions)); + debugTransaction( deterministicRandom()->randomUniqueID() ); applyPersistentOptions(); } From ac289b811a7c37eec8854c60458687dbc39e9f52 Mon Sep 17 00:00:00 2001 From: negoyal Date: Mon, 9 Mar 2020 16:14:25 -0700 Subject: [PATCH 02/27] Don't print the debugTransaction messages to trace file. Only collect info. --- flow/Trace.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/Trace.cpp b/flow/Trace.cpp index 26fc2eb0b6..3a5c84d107 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -1068,6 +1068,7 @@ void TraceBatch::dump() { machine = formatIpPort(local.ip, local.port); } + if (0) { for(int i = 0; i < attachBatch.size(); i++) { if(g_network->isSimulated()) { attachBatch[i].fields.addField("Machine", machine); @@ -1081,6 +1082,7 @@ void TraceBatch::dump() { } g_traceLog.writeEvent(eventBatch[i].fields, "", false); } + } for(int i = 0; i < buggifyBatch.size(); i++) { if(g_network->isSimulated()) { From 10b853694c281943d24b92dafedcdcf906f17f6c Mon Sep 17 00:00:00 2001 From: negoyal Date: Tue, 31 Mar 2020 17:38:00 -0700 Subject: [PATCH 03/27] Reverting the last change. --- flow/Trace.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/flow/Trace.cpp b/flow/Trace.cpp index 3b24bdaead..82e2d7f662 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -1214,7 +1214,6 @@ void TraceBatch::dump() { machine = formatIpPort(local.ip, local.port); } - if (0) { for(int i = 0; i < attachBatch.size(); i++) { if(g_network->isSimulated()) { attachBatch[i].fields.addField("Machine", machine); @@ -1228,7 +1227,6 @@ void TraceBatch::dump() { } g_traceLog.writeEvent(eventBatch[i].fields, "", false); } - } for(int i = 0; i < buggifyBatch.size(); i++) { if(g_network->isSimulated()) { From 3e3ac9f67d4d5559b83a75dc554be2b2cbf191c7 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 1 Apr 2020 14:41:16 -0700 Subject: [PATCH 04/27] Added interface --- flow/CMakeLists.txt | 1 + flow/FBTrace.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ flow/Trace.cpp | 5 +++++ 3 files changed, 54 insertions(+) create mode 100644 flow/FBTrace.h diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 27b61b768f..15677fa01d 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -20,6 +20,7 @@ set(FLOW_SRCS FastRef.h FaultInjection.cpp FaultInjection.h + FBTrace.h FileTraceLogWriter.cpp FileTraceLogWriter.h Hash3.c diff --git a/flow/FBTrace.h b/flow/FBTrace.h new file mode 100644 index 0000000000..092e1ec5ed --- /dev/null +++ b/flow/FBTrace.h @@ -0,0 +1,48 @@ +/* + * FBTrace.h + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "flow/FastRef.h"
+#include "flow/ObjectSerializer.h"
+#include <type_traits>
+
+class FBTraceImpl : public ReferenceCounted<FBTraceImpl> {
+protected:
+	virtual void write(ObjectWriter& writer) = 0;
+public:
+	virtual ~FBTraceImpl();
+};
+
+template <class T>
+class FDBTrace : FBTraceImpl {
+protected:
+	void write(ObjectWriter& writer) override {
+		writer.serialize(*static_cast<T*>(this));
+	}
+};
+
+void fbTrace(Reference<FBTraceImpl> const& traceLine);
+
+template <class T>
+void fbTrace(Reference<T> const& traceLine) {
+	static_assert(std::is_base_of<FBTraceImpl, T>::value, "fbTrace only accepts FBTraceImpl as argument");
+	fbTrace(traceLine.template castTo<FBTraceImpl>());
+}
diff --git a/flow/Trace.cpp b/flow/Trace.cpp
index 82e2d7f662..c57abfc670 100644
--- a/flow/Trace.cpp
+++ b/flow/Trace.cpp
@@ -20,6 +20,7 @@
 
 #include "flow/Trace.h"
+#include "flow/FBTrace.h"
 #include "flow/FileTraceLogWriter.h"
 #include "flow/XmlTraceLogFormatter.h"
 #include "flow/JsonTraceLogFormatter.h"
@@ -1462,3 +1463,7 @@ std::string traceableStringToString(const char* value, size_t S) {
 
 	return std::string(value, S - 1); // Exclude trailing \0 byte
 }
+
+FBTraceImpl::~FBTraceImpl() {}
+
+void fbTrace(Reference<FBTraceImpl> const& traceLine) {}

From 6a899dd0d805bd73c3044cb9e0b590730c9711be Mon Sep 17 00:00:00 2001
From: Markus Pilman
Date: Fri, 3 Apr 2020 09:24:23 -0700
Subject: [PATCH 05/27] Added knob to turn on/off fbtraces

---
 flow/Knobs.cpp | 3 +++
 flow/Knobs.h   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp
index 392af42e29..f19dd0f2f2 100644
--- a/flow/Knobs.cpp
+++ b/flow/Knobs.cpp
@@ -164,6 +164,9 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) {
 	init( MAX_TRACE_EVENT_LENGTH, 4000 ); // If the value of this is changed, the corresponding default in Trace.cpp should be changed as well
 	init( ALLOCATION_TRACING_ENABLED, true );
 
+	// Flatbuffers Traces
+	init( FBTRACES_ENABLED, true );
+
 	//TDMetrics
 	init( MAX_METRICS, 600 );
 	init( MAX_METRIC_SIZE, 2500 );
diff --git a/flow/Knobs.h b/flow/Knobs.h
index 0dcd52a983..ed141886e3 100644
--- a/flow/Knobs.h
+++ b/flow/Knobs.h
@@ -185,6 +185,9 @@ public:
 	int MAX_TRACE_EVENT_LENGTH;
 	bool ALLOCATION_TRACING_ENABLED;
 
+	// Flatbuffers Traces
+	bool FBTRACES_ENABLED;
+
 	//TDMetrics
 	int64_t MAX_METRIC_SIZE;
 	int64_t MAX_METRIC_LEVEL;

From 740c5490b3f7741aeb5adb4f0d6d416ca61859c6 Mon Sep 17 00:00:00 2001
From: negoyal
Date: Mon, 6 Apr 2020 16:00:40 -0700
Subject: [PATCH 06/27] Added flat buffer classes for debugTransaction trace
 message types.

---
 fdbserver/StorageCache.actor.cpp |   9 +-
 flow/FBTrace.h                   | 144 ++++++++++++++++++++++++++++++-
 2 files changed, 150 insertions(+), 3 deletions(-)

diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp
index 67516e291e..741f43ffa4 100644
--- a/fdbserver/StorageCache.actor.cpp
+++ b/fdbserver/StorageCache.actor.cpp
@@ -28,6 +28,7 @@
 #include "fdbserver/LogSystem.h"
 #include "fdbserver/WaitFailure.h"
 #include "fdbserver/WorkerInterface.actor.h"
+#include "flow/FBTrace.h"
 #include "flow/actorcompiler.h" // This must be the last #include.
 
@@ -233,7 +234,7 @@ ACTOR Future<Version> waitForVersionNoTooOld( StorageCacheData* data, Version ve
 
 ACTOR Future<Void> getValueQ( StorageCacheData* data, GetValueRequest req ) {
 	state int64_t resultSize = 0;
-
+	state Reference<GetValueDebugTrace> getValueTrace;
 	try {
 		++data->counters.getValueQueries;
 		++data->counters.allQueries;
@@ -247,8 +248,12 @@ ACTOR Future<Void> getValueQ( StorageCacheData* data, GetValueRequest req ) {
 		//TODO what's this?
 		wait( delay(0, TaskPriority::DefaultEndpoint) );
 
-		if( req.debugID.present() )
+		if( req.debugID.present() ) {
 			g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
+			//FIXME
+			getValueTrace = GetValueDebugTrace(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ);
+			fbTrace(getValueTrace);
+		}
 
 		state Optional<Value> v;
 		state Version version = wait( waitForVersion( data, req.version ) );
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index 092e1ec5ed..69aae928d9 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -33,12 +33,154 @@ public:
 };
 
 template <class T>
 class FDBTrace : FBTraceImpl {
-protected:
+	protected:
 	void write(ObjectWriter& writer) override {
 		writer.serialize(*static_cast<T*>(this));
 	}
 };
 
+class GetValueDebugTrace : public FDBTrace<GetValueDebugTrace> {
+	constexpr static FileIdentifier file_identifier = 617894;
+public:
+	enum codeLocation {
+		STORAGESERVER_GETVALUE_RECEIVED = 0,
+		STORAGESERVER_GETVALUE_DO_READ = 1,
+		STORAGESERVER_GETVALUE_AFTER_VERSION = 2,
+		STORAGESERVER_GETVALUE_AFTER_READ = 3,
+		STORAGECACHE_GETVALUE_RECEIVED = 4,
+		STORAGECACHE_GETVALUE_DO_READ = 5,
+		STORAGECACHE_GETVALUE_AFTER_VERSION = 6,
+		STORAGECACHE_GETVALUE_AFTER_READ = 7,
+		READER_GETVALUE_BEFORE = 8,
+		READER_GETVALUE_AFTER = 9,
+		READER_GETVALUEPREFIX_BEFORE = 10,
+		READER_GETVALUEPREFIX_AFTER = 11
+	};
+
+	uint64_t id;
+	double time;
+	int32_t location;
+
+	GetValueDebugTrace() {}
+	GetValueDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {}
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, id, time, location);
+	}
+};
+
+class WatchValueDebugTrace : public FDBTrace<WatchValueDebugTrace> {
+	constexpr static FileIdentifier file_identifier = 14486715;
+	enum codeLocation {
+		STORAGESERVER_WATCHVALUE_BEFORE = 1,
+		STORAGESERVER_WATCHVALUE_AFTER_VERSION = 2,
+		STORAGESERVER_WATCHVALUE_AFTER_READ = 3,
+		NATIVEAPI_WATCHVALUE_BEFORE = 4,
+		NATIVEAPI_WATCHVALUE_AFTER_READ = 5
+	};
+
+	uint64_t id;
+	double time;
+	int32_t location;
+
+	WatchValueDebugTrace() {}
+	WatchValueDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {}
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, id, time, location);
+	}
+};
+
+class CommitDebugTrace : public FDBTrace<CommitDebugTrace> {
+	constexpr static FileIdentifier file_identifier = 7691518;
+	enum codeLocation {
+		STORAGESERVER_COMMIT_BEFORE = 0,
+		STORAGESERVER_COMMIT_AFTER_VERSION = 1,
+		STORAGESERVER_COMMIT_AFTER_READ = 2,
+		NATIVEAPI_COMMIT_BEFORE = 3,
+		NATIVEAPI_COMMIT_AFTER = 4,
+		MASTERPROXYSERVER_BATCHER = 5,
+		MASTERPROXYSERVER_COMMITBATCH_BEFORE = 6,
+		MASTERPROXYSERVER_COMMITBATCH_GETTINGCOMMITVERSION = 7,
+		MASTERPROXYSERVER_COMMITBATCH_GOTCOMMITVERSION = 8,
+		MASTERPROXYSERVER_COMMITBATCH_AFTERRESOLUTION = 9,
+		MASTERPROXYSERVER_COMMITBATCH_PROCESSINGMUTATIONS = 10,
+		MASTERPROXYSERVER_COMMITBATCH_AFTERSTORECOMMITS = 11,
+		MASTERPROXYSERVER_COMMITBATCH_AFTERLOGPUSH = 12,
+		RESOLVER_RESOLVEBATCH_BEFORE = 13,
+		RESOLVER_RESOLVEBATCH_AFTERQUEUESIZECHECK = 14,
+		RESOLVER_RESOLVEBATCH_AFTERORDERER = 15,
+		RESOLVER_RESOLVEBATCH_AFTER = 16,
+		TLOG_TLOGCOMMIT_BEFOREWAITFORVERSION = 17,
+		TLOG_TLOGCOMMIT_BEFORE = 18,
+		TLOG_TLOGCOMMIT_AFTERTLOGCOMMIT = 19,
+		TLOG_TLOGCOMMIT_AFTER = 20
+	};
+
+	uint64_t id;
+	double time;
+	int32_t location;
+
+	CommitDebugTrace() {}
+	CommitDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {}
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, id, time, location);
+	}
+};
+
+class TransactionDebugTrace : public FDBTrace<TransactionDebugTrace> {
+	constexpr static FileIdentifier file_identifier = 6868728;
+	enum codeLocation {
+		STORAGESERVER_GETKEYVALUES_BEFORE = 0,
+		STORAGESERVER_GETKEYVALUES_AFTERVERSION = 1,
+		STORAGESERVER_GETKEYVALUES_AFTERKEYS = 2,
+		STORAGESERVER_GETKEYVALUES_SEND = 3,
+		STORAGESERVER_GETKEYVALUES_AFTERREADRANGE = 4,
+		NATIVEAPI_GETKEYLOCATION_BEFORE = 5,
+		NATIVEAPI_GETKEYLOCATION_AFTER = 6,
+		NATIVEAPI_GETKEYLOCATIONS_BEFORE = 7,
+		NATIVEAPI_GETKEYLOCATIONS_AFTER = 8,
+		NATIVEAPI_GETVALUE_BEFORE = 9,
+		NATIVEAPI_GETVALUE_AFTER = 10,
+		NATIVEAPI_GETVALUE_ERROR = 11,
+		NATIVEAPI_GETKEY_AFTERVERSION = 12,
+		NATIVEAPI_GETKEY_BEFORE = 13,
+		NATIVEAPI_GETKEY_AFTER = 14,
+		NATIVEAPI_GETEXACTRANGE_BEFORE = 15,
+		NATIVEAPI_GETEXACTRANGE_AFTER = 16,
+		NATIVEAPI_GETRANGE_BEFORE = 17,
+		NATIVEAPI_GETRANGE_AFTER = 18,
+		NATIVEAPI_GETRANGE_ERROR = 19,
+		NATIVEAPI_GETCONSISTENTREADVERSION_BEFORE = 20,
+		NATIVEAPI_GETCONSISTENTREADVERSION_AFTER = 21,
+		STORAGECACHE_GETKEYVALUES_BEFORE = 22,
+		STORAGECACHE_GETKEYVALUES_AFTERVERSION = 23,
+		STORAGECACHE_GETKEYVALUES_AFTERKEYS = 24,
+		STORAGECACHE_GETKEYVALUES_SEND = 25,
+		STORAGECACHE_GETKEYVALUES_AFTERREADRANGE = 26,
+		MASTERPROXYSERVER_QUEUETRANSACTIONSTARTREQUESTS_BEFORE = 27,
+		MASTERPROXYSERVER_GETLIVECOMMITTEDVERSION_CONFIRMEPOCHLIVE = 28,
+		MASTERPROXYSERVER_GETLIVECOMMITTEDVERSION_AFTER = 29,
+		MASTERPROXYSERVER_MASTERPROXYSERVERCORE_BROADCAST = 30,
+		MASTERPROXYSERVER_MASTERPROXYSERVERCORE_GETRAWCOMMITTEDVERSION = 31,
+		TLOGSERVER_TLOGCONFIRMRUNNINGREQUEST = 33,
+		READWRITE_RANDOMREADWRITECLIENT_BEFORE = 34,
+		READWRITE_RANDOMREADWRITECLIENT_AFTER = 35
+	};
+
+	uint64_t id;
+	double time;
+	int32_t location;
+
+	TransactionDebugTrace() {}
+	TransactionDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {}
+	template <class Ar>
+	void serialize(Ar& ar) {
+		serializer(ar, id, time, location);
+	}
+};
+
 void fbTrace(Reference<FBTraceImpl> const& traceLine);
 
 template <class T>

From e24017fae3b1d31c783663fdc226f3bdee67ade5 Mon Sep 17 00:00:00 2001
From: negoyal
Date: Mon, 6 Apr 2020 17:53:26 -0700
Subject: [PATCH 07/27] Minor definition fixes.

---
 fdbserver/StorageCache.actor.cpp | 6 ++----
 flow/FBTrace.h                   | 9 ++++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp
index 741f43ffa4..5e2471cb9a 100644
--- a/fdbserver/StorageCache.actor.cpp
+++ b/fdbserver/StorageCache.actor.cpp
@@ -234,7 +234,6 @@ ACTOR Future<Version> waitForVersionNoTooOld( StorageCacheData* data, Version ve
 
 ACTOR Future<Void> getValueQ( StorageCacheData* data, GetValueRequest req ) {
 	state int64_t resultSize = 0;
-	state Reference<GetValueDebugTrace> getValueTrace;
 	try {
 		++data->counters.getValueQueries;
 		++data->counters.allQueries;
@@ -244,15 +243,14 @@ ACTOR Future<Void> getValueQ( StorageCacheData* data, GetValueRequest req ) {
 
 		// Active load balancing runs at a very high priority (to obtain accurate queue lengths)
 		// so we need to downgrade here
-
+
 		//TODO what's this?
 		wait( delay(0, TaskPriority::DefaultEndpoint) );
 
 		if( req.debugID.present() ) {
 			g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
 			//FIXME
-			getValueTrace = GetValueDebugTrace(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ);
-			fbTrace(getValueTrace);
+			fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ)));
 		}
 
 		state Optional<Value> v;
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index 69aae928d9..3c76a108e0 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -32,7 +32,7 @@ public:
 };
 
 template <class T>
-class FDBTrace : FBTraceImpl {
+class FDBTrace : public FBTraceImpl {
 	protected:
 	void write(ObjectWriter& writer) override {
 		writer.serialize(*static_cast<T*>(this));
@@ -40,8 +40,8 @@ class FDBTrace : FBTraceImpl {
 };
 
 class GetValueDebugTrace : public FDBTrace<GetValueDebugTrace> {
-	constexpr static FileIdentifier file_identifier = 617894;
 public:
+	constexpr static FileIdentifier file_identifier = 617894;
 	enum codeLocation {
 		STORAGESERVER_GETVALUE_RECEIVED = 0,
@@ -70,6 +70,7 @@ public:
 };
 
 class WatchValueDebugTrace : public FDBTrace<WatchValueDebugTrace> {
+public:
 	constexpr static FileIdentifier file_identifier = 14486715;
 	enum codeLocation {
 		STORAGESERVER_WATCHVALUE_BEFORE = 1,
@@ -92,6 +93,7 @@ class WatchValueDebugTrace : public FDBTrace<WatchValueDebugTrace> {
 };
 
 class CommitDebugTrace : public FDBTrace<CommitDebugTrace> {
+public:
 	constexpr static FileIdentifier file_identifier = 7691518;
 	enum codeLocation {
 		STORAGESERVER_COMMIT_BEFORE = 0,
@@ -130,6 +132,7 @@ class CommitDebugTrace : public FDBTrace<CommitDebugTrace> {
 };
 
 class TransactionDebugTrace : public FDBTrace<TransactionDebugTrace> {
+public:
 	constexpr static FileIdentifier file_identifier = 6868728;
 	enum codeLocation {
 		STORAGESERVER_GETKEYVALUES_BEFORE = 0,
@@ -184,7 +187,7 @@ class TransactionDebugTrace : public FDBTrace<TransactionDebugTrace> {
 void fbTrace(Reference<FBTraceImpl> const& traceLine);
 
 template <class T>
-void fbTrace(Reference<T> const& traceLine) {
+void fbTrace(Reference<T> traceLine) {
 	static_assert(std::is_base_of<FBTraceImpl, T>::value, "fbTrace only accepts FBTraceImpl as argument");
 	fbTrace(traceLine.template castTo<FBTraceImpl>());
 }

From be68bd6ad239be0a6cc47ee42f85e929d89310a1 Mon Sep 17 00:00:00 2001
From: negoyal
Date: Thu, 9 Apr 2020 16:45:24 -0700
Subject: [PATCH 08/27] Add FBTrace calls for most debugTransaction trace
 messages.
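
Each converted call site keeps its existing g_traceBatch event and adds a
flatbuffers trace record next to it. A minimal sketch of the per-site shape,
taken from the getKeyLocation site in the diff below (only the event string
and the codeLocation constant vary between sites; Reference<TransactionDebugTrace>
is assumed here as the handle type for the newly constructed trace object):

    if( info.debugID.present() ) {
        g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(),
                              "NativeAPI.getKeyLocation.Before");
        // FIXME: duplicates the g_traceBatch event above (assumed temporary
        // until the flatbuffers path fully replaces TraceBatch)
        fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(
            info.debugID.get().first(), now(),
            TransactionDebugTrace::NATIVEAPI_GETKEYLOCATION_BEFORE)));
    }

StorageCache.actor.cpp is the exception: judging by the diffstat below, this
commit strips its contents rather than converting its call sites.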
--- fdbclient/NativeAPI.actor.cpp | 105 ++- fdbserver/KeyValueStoreSQLite.actor.cpp | 29 +- fdbserver/MasterProxyServer.actor.cpp | 67 +- fdbserver/Resolver.actor.cpp | 19 +- fdbserver/StorageCache.actor.cpp | 1010 ----------------------- fdbserver/TLogServer.actor.cpp | 25 +- fdbserver/storageserver.actor.cpp | 73 +- fdbserver/workloads/ReadWrite.actor.cpp | 10 +- flow/FBTrace.h | 27 +- 9 files changed, 298 insertions(+), 1067 deletions(-) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 43434998be..636d10c848 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -47,6 +47,7 @@ #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" #include "flow/UnitTest.h" +#include "flow/FBTrace.h" #if defined(CMAKE_BUILD) || !defined(WIN32) #include "versions.h" @@ -1188,8 +1189,12 @@ ACTOR Future< pair> > getKeyLocation_internal( ASSERT( key < allKeys.end ); } - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.Before"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETKEYLOCATION_BEFORE))); + } loop { ++cx->transactionKeyServerLocationRequests; @@ -1197,8 +1202,12 @@ ACTOR Future< pair> > getKeyLocation_internal( when ( wait( cx->onMasterProxiesChanged() ) ) {} when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { ++cx->transactionKeyServerLocationRequestsCompleted; - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETKEYLOCATION_AFTER))); + } ASSERT( rep.results.size() == 1 ); auto locationInfo = cx->setCachedLocation(rep.results[0].first, rep.results[0].second); @@ -1227,8 +1236,12 @@ Future>> getKeyLocation( Database const& } ACTOR Future< vector< pair> > > getKeyRangeLocations_internal( Database cx, KeyRange keys, int limit, bool reverse, TransactionInfo info ) { - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.Before"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETKEYLOCATIONS_BEFORE))); + } loop { ++cx->transactionKeyServerLocationRequests; @@ -1237,8 +1250,12 @@ ACTOR Future< vector< pair> > > getKeyRangeLoca when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) { ++cx->transactionKeyServerLocationRequestsCompleted; state GetKeyServerLocationsReply rep = _rep; - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.After"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + 
TransactionDebugTrace::NATIVEAPI_GETKEYLOCATIONS_AFTER))); + } ASSERT( rep.results.size() ); state vector< pair> > results; @@ -1338,6 +1355,9 @@ ACTOR Future> getValue( Future version, Key key, Databa g_traceBatch.addAttach("GetValueAttachID", info.debugID.get().first(), getValueID.get().first()); g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(getValueID.get().first(), now(), + GetValueDebugTrace::NATIVEAPI_GETVALUE_BEFORE))); /*TraceEvent("TransactionDebugGetValueInfo", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) @@ -1382,6 +1402,9 @@ ACTOR Future> getValue( Future version, Key key, Databa if( info.debugID.present() ) { g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.After"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(getValueID.get().first(), now(), + GetValueDebugTrace::NATIVEAPI_GETVALUE_AFTER))); /*TraceEvent("TransactionDebugGetValueDone", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) @@ -1396,6 +1419,9 @@ ACTOR Future> getValue( Future version, Key key, Databa cx->getValueCompleted->log(); if( info.debugID.present() ) { g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Error"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(getValueID.get().first(), now(), + GetValueDebugTrace::NATIVEAPI_GETVALUE_ERROR))); /*TraceEvent("TransactionDebugGetValueDone", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) @@ -1417,8 +1443,12 @@ ACTOR Future> getValue( Future version, Key key, Databa ACTOR Future getKey( Database cx, KeySelector k, Future version, TransactionInfo info ) { wait(success(version)); - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.AfterVersion"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETKEY_AFTERVERSION))); + } loop { if (k.getKey() == allKeys.end) { @@ -1433,8 +1463,12 @@ ACTOR Future getKey( Database cx, KeySelector k, Future version, T state pair> ssi = wait( getKeyLocation(cx, locationKey, &StorageServerInterface::getKey, info, k.isBackward()) ); try { - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETKEY_BEFORE))); + } ++cx->transactionPhysicalReads; state GetKeyReply reply; try { @@ -1452,8 +1486,12 @@ ACTOR Future getKey( Database cx, KeySelector k, Future version, T ++cx->transactionPhysicalReadsCompleted; throw; } - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", reply.sel.offset).detail("OrEqual", k.orEqual); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETKEY_AFTER))); + } k = reply.sel; if (!k.offset && k.orEqual) { return 
k.getKey(); @@ -1526,6 +1564,9 @@ ACTOR Future watchValue(Future version, Key key, Optional g_traceBatch.addAttach("WatchValueAttachID", info.debugID.get().first(), watchValueID.get().first()); g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new WatchValueDebugTrace(watchValueID.get().first(), now(), + WatchValueDebugTrace::NATIVEAPI_WATCHVALUE_BEFORE))); } state WatchValueReply resp; choose { @@ -1538,6 +1579,9 @@ ACTOR Future watchValue(Future version, Key key, Optional } if( info.debugID.present() ) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new WatchValueDebugTrace(watchValueID.get().first(), now(), + WatchValueDebugTrace::NATIVEAPI_WATCHVALUE_AFTER))); } //FIXME: wait for known committed version on the storage server before replying, @@ -1618,6 +1662,9 @@ ACTOR Future> getExactRange( Database cx, Version ver try { if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.Before"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETEXACTRANGE_BEFORE))); /*TraceEvent("TransactionDebugGetExactRangeInfo", info.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) @@ -1644,8 +1691,12 @@ ACTOR Future> getExactRange( Database cx, Version ver ++cx->transactionPhysicalReadsCompleted; throw; } - if( info.debugID.present() ) + if( info.debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.After"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETEXACTRANGE_AFTER))); + } output.arena().dependsOn( rep.arena ); output.append( output.arena(), rep.data.begin(), rep.data.size() ); @@ -1904,6 +1955,9 @@ ACTOR Future> getRange( Database cx, Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETRANGE_BEFORE))); /*TraceEvent("TransactionDebugGetRangeInfo", info.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) @@ -1939,6 +1993,9 @@ ACTOR Future> getRange( Database cx, Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETEXACTRANGE_AFTER))); /*TraceEvent("TransactionDebugGetRangeDone", info.debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) @@ -2026,6 +2083,9 @@ ACTOR Future> getRange( Database cx, Reference(new TransactionDebugTrace(info.debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETRANGE_ERROR))); TraceEvent("TransactionDebugError", info.debugID.get()).error(e); } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || @@ -2712,6 +2772,9 @@ ACTOR static Future tryCommit( Database cx, Reference commitID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("CommitAttachID", info.debugID.get().first(), commitID.get().first()); g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.Before"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(commitID.get().first(), now(), + 
CommitDebugTrace::NATIVEAPI_COMMIT_BEFORE))); } req.debugID = commitID; @@ -2755,9 +2818,12 @@ ACTOR static Future tryCommit( Database cx, Reference cx->transactionCommittedMutations += req.transaction.mutations.size(); cx->transactionCommittedMutationBytes += req.transaction.mutations.expectedSize(); - if(info.debugID.present()) + if(info.debugID.present()) { g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After"); - + //FIXME + fbTrace(Reference(new CommitDebugTrace(commitID.get().first(), now(), + CommitDebugTrace::NATIVEAPI_COMMIT_AFTER))); + } double latency = now() - startTime; cx->commitLatencies.addSample(latency); cx->latencies.addSample(now() - tr->startTime); @@ -2807,9 +2873,12 @@ ACTOR static Future tryCommit( Database cx, Reference if (info.debugID.present()) TraceEvent(interval.end()).detail("Conflict", 1); - if(info.debugID.present()) + if(info.debugID.present()) { g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After"); - + //FIXME + fbTrace(Reference(new CommitDebugTrace(commitID.get().first(), now(), + CommitDebugTrace::NATIVEAPI_COMMIT_AFTER))); + } throw not_committed(); } } @@ -3139,15 +3208,23 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional getConsistentReadVersion( DatabaseContext *cx, uint32_t transactionCount, uint32_t flags, Optional debugID ) { try { ++cx->transactionReadVersionBatches; - if( debugID.present() ) + if( debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.Before"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETCONSISTENTREADVERSION_BEFORE))); + } loop { state GetReadVersionRequest req( transactionCount, flags, debugID ); choose { when ( wait( cx->onMasterProxiesChanged() ) ) {} when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES), &MasterProxyInterface::getConsistentReadVersion, req, cx->taskID ) ) ) { - if( debugID.present() ) + if( debugID.present() ) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.After"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), + TransactionDebugTrace::NATIVEAPI_GETCONSISTENTREADVERSION_AFTER))); + } ASSERT( v.version > 0 ); cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version); return v; diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index 7f134dc0a0..78d098027b 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -25,6 +25,7 @@ #include "fdbserver/CoroFlow.h" #include "fdbserver/Knobs.h" #include "flow/Hash3.h" +#include "flow/FBTrace.h" extern "C" { #include "fdbserver/sqlite/sqliteInt.h" @@ -1530,12 +1531,22 @@ private: }; void action( ReadValueAction& rv ) { //double t = timer(); - if (rv.debugID.present()) g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask()); + if (rv.debugID.present()) { + g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(rv.debugID.get().first(), now(), + GetValueDebugTrace::READER_GETVALUE_BEFORE))); + } rv.result.send( getCursor()->get().get(rv.key) ); 
++counter; - if (rv.debugID.present()) g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask()); + if (rv.debugID.present()) { + g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(rv.debugID.get().first(), now(), + GetValueDebugTrace::READER_GETVALUE_AFTER))); + } //t = timer()-t; //if (t >= 1.0) TraceEvent("ReadValueActionSlow",dbgid).detail("Elapsed", t); } @@ -1550,12 +1561,22 @@ private: }; void action( ReadValuePrefixAction& rv ) { //double t = timer(); - if (rv.debugID.present()) g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask()); + if (rv.debugID.present()) { + g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(rv.debugID.get().first(), now(), + GetValueDebugTrace::READER_GETVALUEPREFIX_BEFORE))); + } rv.result.send( getCursor()->get().getPrefix(rv.key, rv.maxLength) ); ++counter; - if (rv.debugID.present()) g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask()); + if (rv.debugID.present()) { + g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(rv.debugID.get().first(), now(), + GetValueDebugTrace::READER_GETVALUEPREFIX_AFTER))); + } //t = timer()-t; //if (t >= 1.0) TraceEvent("ReadValuePrefixActionSlow",dbgid).detail("Elapsed", t); } diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 6dc743bd50..12e49272e1 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -45,6 +45,7 @@ #include "flow/ActorCollection.h" #include "flow/Knobs.h" #include "flow/Stats.h" +#include "flow/FBTrace.h" #include "flow/TDMetric.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
@@ -191,8 +192,12 @@ ACTOR Future queueTransactionStartRequests( req.reply.send(rep); TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceeded").suppressFor(60); } else { - if (req.debugID.present()) + if (req.debugID.present()) { g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "MasterProxyServer.queueTransactionStartRequests.Before"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), + TransactionDebugTrace::MASTERPROXYSERVER_QUEUETRANSACTIONSTARTREQUESTS_BEFORE))); + } if (systemQueue->empty() && defaultQueue->empty() && batchQueue->empty()) { forwardPromise(GRVTimer, delayJittered(std::max(0.0, *GRVBatchTime - (now() - *lastGRVTime)), TaskPriority::ProxyGRVTimer)); @@ -496,6 +501,9 @@ ACTOR Future commitBatcher(ProxyCommitData *commitData, PromiseStream(new CommitDebugTrace(req.debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_BATCHER))); } if(!batch.size()) { @@ -684,8 +692,12 @@ ACTOR Future commitBatch( TraceEvent("SecondCommitBatch", self->dbgid).detail("DebugID", debugID.get()); } - if (debugID.present()) + if (debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.Before"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_BEFORE))); + } /////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined and *should* be available by now (unless empty commit); ordered; currently atomic but could yield) @@ -694,8 +706,12 @@ ACTOR Future commitBatch( wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1)); state Future releaseDelay = delay(std::min(SERVER_KNOBS->MAX_PROXY_COMPUTE, batchOperations*self->commitComputePerOperation[latencyBucket]), TaskPriority::ProxyMasterVersionReply); - if (debugID.present()) + if (debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.GettingCommitVersion"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_GETTINGCOMMITVERSION))); + } GetCommitVersionRequest req(self->commitVersionRequestNumber++, self->mostRecentProcessedRequestNumber, self->dbgid); GetCommitVersionReply versionReply = wait( brokenPromiseToNever(self->master.getCommitVersion.getReply(req, TaskPriority::ProxyMasterVersionReply)) ); @@ -715,8 +731,12 @@ ACTOR Future commitBatch( //TraceEvent("ProxyGotVer", self->dbgid).detail("Commit", commitVersion).detail("Prev", prevVersion); - if (debugID.present()) + if (debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.GotCommitVersion"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_GOTCOMMITVERSION))); + } ResolutionRequestBuilder requests( self, commitVersion, prevVersion, self->version ); int conflictRangeCount = 0; @@ -748,8 +768,12 @@ ACTOR Future commitBatch( /////// Phase 2: Resolution (waiting on the network; pipelined) state vector resolution = wait( getAll(replies) ); - if (debugID.present()) + if (debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.AfterResolution"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_AFTERRESOLUTION))); + } 
////// Phase 3: Post-resolution processing (CPU bound except for very rare situations; ordered; currently atomic but doesn't need to be) TEST(self->latestLocalCommitBatchLogging.get() < localBatchNumber - 1); // Queuing post-resolution commit processing @@ -760,8 +784,12 @@ ACTOR Future commitBatch( state double computeDuration = 0; self->stats.txnCommitResolved += trs.size(); - if (debugID.present()) + if (debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.ProcessingMutations"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_PROCESSINGMUTATIONS))); + } state Arena arena; state bool isMyFirstBatch = !self->version; @@ -1058,8 +1086,12 @@ ACTOR Future commitBatch( state LogSystemDiskQueueAdapter::CommitMessage msg = storeCommits.back().first.get(); - if (debugID.present()) + if (debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.AfterStoreCommits"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_AFTERSTORECOMMITS))); + } // txnState (transaction subsystem state) tag: message extracted from log adapter bool firstMessage = true; @@ -1133,8 +1165,12 @@ ACTOR Future commitBatch( debug_advanceMinCommittedVersion(UID(), commitVersion); //TraceEvent("ProxyPushed", self->dbgid).detail("PrevVersion", prevVersion).detail("Version", commitVersion); - if (debugID.present()) + if (debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.AfterLogPush"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), + CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_AFTERLOGPUSH))); + } for (auto &p : storeCommits) { ASSERT(!p.second.isReady()); @@ -1261,6 +1297,9 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi if (debugID.present()) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.getLiveCommittedVersion.confirmEpochLive"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), + TransactionDebugTrace::MASTERPROXYSERVER_GETLIVECOMMITTEDVERSION_CONFIRMEPOCHLIVE))); } vector versions = wait(getAll(proxyVersions)); @@ -1277,6 +1316,9 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi if (debugID.present()) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.getLiveCommittedVersion.After"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), + TransactionDebugTrace::MASTERPROXYSERVER_GETLIVECOMMITTEDVERSION_AFTER))); } commitData->stats.txnStartOut += transactionCount; @@ -1435,6 +1477,9 @@ ACTOR static Future transactionStarter( if (debugID.present()) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.masterProxyServerCore.Broadcast"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), + TransactionDebugTrace::MASTERPROXYSERVER_MASTERPROXYSERVERCORE_BROADCAST))); } for (int i = 0; i < start.size(); i++) { @@ -1881,8 +1926,12 @@ ACTOR Future masterProxyServerCore( } when(GetRawCommittedVersionRequest req = waitNext(proxy.getRawCommittedVersion.getFuture())) { //TraceEvent("ProxyGetRCV", proxy.id()); - if (req.debugID.present()) + if (req.debugID.present()) { g_traceBatch.addEvent("TransactionDebug", 
req.debugID.get().first(), "MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion"); + //FIXME + fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), + TransactionDebugTrace::MASTERPROXYSERVER_MASTERPROXYSERVERCORE_GETRAWCOMMITTEDVERSION))); + } GetReadVersionReply rep; rep.locked = commitData.locked; rep.metadataVersion = commitData.metadataVersion; diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index 64b2959c63..34bd28f5e3 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -30,6 +30,7 @@ #include "fdbserver/ConflictSet.h" #include "fdbserver/StorageMetrics.h" #include "fdbclient/SystemData.h" +#include "flow/FBTrace.h" #include "flow/actorcompiler.h" // This must be the last #include. namespace { @@ -115,6 +116,9 @@ ACTOR Future resolveBatch( debugID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("CommitAttachID", req.debugID.get().first(), debugID.get().first()); g_traceBatch.addEvent("CommitDebug",debugID.get().first(),"Resolver.resolveBatch.Before"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), + CommitDebugTrace::RESOLVER_RESOLVEBATCH_BEFORE))); } /*TraceEvent("ResolveBatchStart", self->dbgid).detail("From", proxyAddress).detail("Version", req.version).detail("PrevVersion", req.prevVersion).detail("StateTransactions", req.txnStateTransactions.size()) @@ -132,6 +136,9 @@ ACTOR Future resolveBatch( if(debugID.present()) { g_traceBatch.addEvent("CommitDebug",debugID.get().first(),"Resolver.resolveBatch.AfterQueueSizeCheck"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), + CommitDebugTrace::RESOLVER_RESOLVEBATCH_AFTERQUEUESIZECHECK))); } loop { @@ -164,8 +171,12 @@ ACTOR Future resolveBatch( Version firstUnseenVersion = proxyInfo.lastVersion + 1; proxyInfo.lastVersion = req.version; - if(req.debugID.present()) + if(req.debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.AfterOrderer"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), + CommitDebugTrace::RESOLVER_RESOLVEBATCH_AFTERORDERER))); + } ResolveTransactionBatchReply& reply = proxyInfo.outstandingBatches[req.version]; @@ -276,8 +287,12 @@ ACTOR Future resolveBatch( self->checkNeededVersion.trigger(); } - if(req.debugID.present()) + if(req.debugID.present()) { g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.After"); + //FIXME + fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), + CommitDebugTrace::RESOLVER_RESOLVEBATCH_AFTER))); + } } else { TEST(true); // Duplicate resolve batch request diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 5e2471cb9a..37c118f548 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -1,1011 +1 @@ -/* - * StorageCache.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "fdbserver/Knobs.h" -#include "fdbserver/ServerDBInfo.h" -#include "fdbclient/StorageServerInterface.h" -#include "fdbclient/VersionedMap.h" -#include "fdbclient/KeyRangeMap.h" -#include "fdbclient/Atomic.h" -#include "fdbclient/Notified.h" -#include "fdbserver/LogSystem.h" -#include "fdbserver/WaitFailure.h" -#include "fdbserver/WorkerInterface.actor.h" -#include "flow/FBTrace.h" -#include "flow/actorcompiler.h" // This must be the last #include. - - -//TODO storageCache server shares quite a bit of storageServer functionality, although simplified -// Need to look into refactoring common code out for better code readability and to avoid duplication - -//TODO rename wrong_shard_server error to wrong_cache_server -inline bool canReplyWith(Error e) { - switch(e.code()) { - case error_code_transaction_too_old: - case error_code_future_version: - case error_code_wrong_shard_server: - case error_code_process_behind: - //case error_code_all_alternatives_failed: - return true; - default: - return false; - }; -} - -const int VERSION_OVERHEAD = 64 + sizeof(Version) + sizeof(Standalone) + //mutationLog, 64b overhead for map - 2 * (64 + sizeof(Version) + sizeof(Reference::PTreeT>)); //versioned map [ x2 for createNewVersion(version+1) ], 64b overhead for map -static int mvccStorageBytes( MutationRef const& m ) { return VersionedMap::overheadPerItem * 2 + (MutationRef::OVERHEAD_BYTES + m.param1.size() + m.param2.size()) * 2; } - -struct StorageCacheData { - typedef VersionedMap VersionedData; -private: - // in-memory versioned struct (PTree as of now. Subject to change) - VersionedData versionedData; - // in-memory mutationLog that the versionedData contains references to - // TODO change it to a deque, already contains mutations in version order - std::map> mutationLog; // versions (durableVersion, version] - -public: - UID thisServerID; // unique id - uint16_t index; // server index - Reference>> logSystem; - Key ck; //cacheKey - KeyRangeMap cachedRangeMap; // map of cached key-ranges - - // The following are in rough order from newest to oldest - // TODO double check which ones we need for storageCache servers - Version lastTLogVersion, lastVersionWithData; - NotifiedVersion version; // current version i.e. the max version that can be read from the cache - NotifiedVersion desiredOldestVersion; // oldestVersion can be increased to this after compaction - NotifiedVersion oldestVersion; // Min version that might be read from the cache - - // TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use - Future compactionInProgress; - - // TODO do we need otherError here? 
- Promise otherError; - - int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this cache server - bool behind; - - // TODO double check which ones we need for storageCache servers - struct Counters { - CounterCollection cc; - Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, finishedQueries, rowsQueried, bytesQueried, watchQueries; - Counter bytesInput, mutationBytes; // Like bytesInput but without MVCC accounting - Counter mutations, setMutations, clearRangeMutations, atomicMutations; - Counter updateBatches, updateVersions; - Counter loops; - Counter readsRejected; - - //LatencyBands readLatencyBands; - - Counters(StorageCacheData* self) - : cc("StorageCacheServer", self->thisServerID.toString()), - getKeyQueries("GetKeyQueries", cc), - getValueQueries("GetValueQueries",cc), - getRangeQueries("GetRangeQueries", cc), - allQueries("QueryQueue", cc), - finishedQueries("FinishedQueries", cc), - rowsQueried("RowsQueried", cc), - bytesQueried("BytesQueried", cc), - watchQueries("WatchQueries", cc), - bytesInput("BytesInput", cc), - mutationBytes("MutationBytes", cc), - mutations("Mutations", cc), - setMutations("SetMutations", cc), - clearRangeMutations("ClearRangeMutations", cc), - atomicMutations("AtomicMutations", cc), - updateBatches("UpdateBatches", cc), - updateVersions("UpdateVersions", cc), - loops("Loops", cc), - readsRejected("ReadsRejected", cc) - { - specialCounter(cc, "LastTLogVersion", [self](){ return self->lastTLogVersion; }); - specialCounter(cc, "Version", [self](){ return self->version.get(); }); - specialCounter(cc, "VersionLag", [self](){ return self->versionLag; }); - } - } counters; - - explicit StorageCacheData(UID thisServerID, uint16_t index) - : thisServerID(thisServerID), index(index), - logSystem(new AsyncVar>()), - lastTLogVersion(0), lastVersionWithData(0), - compactionInProgress(Void()), - versionLag(0), behind(false), counters(this) - { - version.initMetric(LiteralStringRef("StorageCacheData.Version"), counters.cc.id); - desiredOldestVersion.initMetric(LiteralStringRef("StorageCacheData.DesriedOldestVersion"), counters.cc.id); - oldestVersion.initMetric(LiteralStringRef("StorageCacheData.OldestVersion"), counters.cc.id); - } - - void addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation); - - bool isReadable( KeyRangeRef const& keys ) { - auto cr = cachedRangeMap.intersectingRanges(keys); - for(auto i = cr.begin(); i != cr.end(); ++i) - if (!i->value()) - return false; - return true; - } - - Arena lastArena; - std::map> const & getMutationLog() { return mutationLog; } - std::map>& getMutableMutationLog() { return mutationLog; } - VersionedData const& data() const { return versionedData; } - VersionedData& mutableData() { return versionedData; } - - Standalone& addVersionToMutationLog(Version v) { - // return existing version... 
- auto m = mutationLog.find(v); - if (m != mutationLog.end()) - return m->second; - - // ...or create a new one - auto& u = mutationLog[v]; - u.version = v; - if (lastArena.getSize() >= 65536) lastArena = Arena(4096); - u.arena() = lastArena; - counters.bytesInput += VERSION_OVERHEAD; - return u; - } - - MutationRef addMutationToMutationLog(Standalone &mLV, MutationRef const& m){ - //TODO find out more - //byteSampleApplyMutation(m, mLV.version); - counters.bytesInput += mvccStorageBytes(m); - return mLV.mutations.push_back_deep( mLV.arena(), m ); - } - -}; - -///////////////////////////////////// Queries ///////////////////////////////// -#pragma region Queries -ACTOR Future waitForVersion( StorageCacheData* data, Version version ) { - // This could become an Actor transparently, but for now it just does the lookup - if (version == latestVersion) - version = std::max(Version(1), data->version.get()); - if (version < data->oldestVersion.get() || version <= 0) throw transaction_too_old(); - else if (version <= data->version.get()) - return version; - - if(data->behind && version > data->version.get()) { - throw process_behind(); - } - - if(deterministicRandom()->random01() < 0.001) - TraceEvent("WaitForVersion1000x"); - choose { - when ( wait( data->version.whenAtLeast(version) ) ) { - //FIXME: A bunch of these can block with or without the following delay 0. - //wait( delay(0) ); // don't do a whole bunch of these at once - if (version < data->oldestVersion.get()) throw transaction_too_old(); - return version; - } - when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { - if(deterministicRandom()->random01() < 0.001) - TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) - .detail("Version", version) - .detail("MyVersion", data->version.get()) - .detail("ServerID", data->thisServerID); - throw future_version(); - } - } -} - -ACTOR Future waitForVersionNoTooOld( StorageCacheData* data, Version version ) { - // This could become an Actor transparently, but for now it just does the lookup - if (version == latestVersion) - version = std::max(Version(1), data->version.get()); - if (version <= data->version.get()) - return version; - choose { - when ( wait( data->version.whenAtLeast(version) ) ) { - return version; - } - when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { - if(deterministicRandom()->random01() < 0.001) - TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) - .detail("Version", version) - .detail("MyVersion", data->version.get()) - .detail("ServerID", data->thisServerID); - throw future_version(); - } - } -} - -ACTOR Future getValueQ( StorageCacheData* data, GetValueRequest req ) { - state int64_t resultSize = 0; - try { - ++data->counters.getValueQueries; - ++data->counters.allQueries; - //++data->readQueueSizeMetric; - //TODO later - //data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue()); - - // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here - - //TODO what's this? 
- wait( delay(0, TaskPriority::DefaultEndpoint) ); - - if( req.debugID.present() ) { - g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new GetValueDebugTrace(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ))); - } - - state Optional v; - state Version version = wait( waitForVersion( data, req.version ) ); - if( req.debugID.present() ) - g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask()); - - if (!data->cachedRangeMap[req.key]) { - //TraceEvent("WrongCacheServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); - throw wrong_shard_server(); - } - - state int path = 0; - auto i = data->data().at(version).lastLessOrEqual(req.key); - if (i && i->isValue() && i.key() == req.key) { - v = (Value)i->getValue(); - path = 1; - } - - //debugMutation("CacheGetValue", version, MutationRef(MutationRef::DebugKey, req.key, v.present()?v.get():LiteralStringRef(""))); - //debugMutation("CacheGetPath", version, MutationRef(MutationRef::DebugKey, req.key, path==0?LiteralStringRef("0"):path==1?LiteralStringRef("1"):LiteralStringRef("2"))); - - if (v.present()) { - ++data->counters.rowsQueried; - resultSize = v.get().size(); - data->counters.bytesQueried += resultSize; - } - - if( req.debugID.present() ) - g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask()); - - GetValueReply reply(v); - req.reply.send(reply); - } catch (Error& e) { - if(!canReplyWith(e)) - throw; - req.reply.sendError(e); - } - - ++data->counters.finishedQueries; - //--data->readQueueSizeMetric; - //if(data->latencyBandConfig.present()) { - // int maxReadBytes = data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits::max()); - // data->counters.readLatencyBands.addMeasurement(timer() - req.requestTime(), resultSize > maxReadBytes); - //} - - return Void(); -}; - -//TODO Implement the reverse readRange -GetKeyValuesReply readRange(StorageCacheData* data, Version version, KeyRangeRef range, int limit, int* pLimitBytes) { - GetKeyValuesReply result; - StorageCacheData::VersionedData::ViewAtVersion view = data->data().at(version); - StorageCacheData::VersionedData::iterator vCurrent = view.end(); - KeyRef readBegin; - KeyRef rangeBegin = range.begin; - KeyRef rangeEnd = range.end; - - //We might care about a clear beginning before start that runs into range - vCurrent = view.lastLessOrEqual(rangeBegin); - if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() > rangeBegin) - readBegin = vCurrent->getEndKey(); - else - readBegin = rangeBegin; - - vCurrent = view.lower_bound(readBegin); - ASSERT(!vCurrent || vCurrent.key() >= readBegin); - if (vCurrent) { - auto b = vCurrent; - --b; - ASSERT(!b || b.key() < readBegin); - } - int accumulatedBytes = 0; - while (vCurrent && vCurrent.key() < rangeEnd && limit > 0 && accumulatedBytes < *pLimitBytes) { - if (!vCurrent->isClearTo()) { - result.data.push_back_deep(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue())); - accumulatedBytes += sizeof(KeyValueRef) + result.data.end()[-1].expectedSize(); - --limit; - } - ++vCurrent; - } - - *pLimitBytes -= accumulatedBytes; - ASSERT(result.data.size() == 0 || *pLimitBytes + result.data.end()[-1].expectedSize() + sizeof(KeyValueRef) > 0); - 
result.more = limit == 0 || *pLimitBytes <= 0; // FIXME: Does this have to be exact? - result.version = version; - return result; -} - -Key findKey( StorageCacheData* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset) -// Attempts to find the key indicated by sel in the data at version, within range. -// Precondition: selectorInRange(sel, range) -// If it is found, offset is set to 0 and a key is returned which falls inside range. -// If the search would depend on any key outside range OR if the key selector offset is too large (range read returns too many bytes), it returns either -// a negative offset and a key in [range.begin, sel.getKey()], indicating the key is (the first key <= returned key) + offset, or -// a positive offset and a key in (sel.getKey(), range.end], indicating the key is (the first key >= returned key) + offset-1 -// The range passed in to this function should specify a shard. If range.begin is repeatedly not the beginning of a shard, then it is possible to get stuck looping here -{ - ASSERT( version != latestVersion ); - ASSERT( selectorInRange(sel, range) && version >= data->oldestVersion.get()); - - // Count forward or backward distance items, skipping the first one if it == key and skipEqualKey - bool forward = sel.offset > 0; // If forward, result >= sel.getKey(); else result <= sel.getKey() - int sign = forward ? +1 : -1; - bool skipEqualKey = sel.orEqual == forward; - int distance = forward ? sel.offset : 1-sel.offset; - - //Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from the read range in this case) - int maxBytes; - if (sel.offset <= 1 && sel.offset >= 0) - maxBytes = std::numeric_limits::max(); - else - maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES; - - GetKeyValuesReply rep = readRange( data, version, - forward ? KeyRangeRef(sel.getKey(), range.end) : KeyRangeRef(range.begin, keyAfter(sel.getKey())), - (distance + skipEqualKey)*sign, &maxBytes ); - bool more = rep.more && rep.data.size() != distance + skipEqualKey; - - //If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in a loop - if(more && !forward && rep.data.size() == 1) { - TEST(true); //Reverse key selector returned only one result in range read - maxBytes = std::numeric_limits::max(); - GetKeyValuesReply rep2 = readRange( data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes ); - rep = rep2; - more = rep.more && rep.data.size() != distance + skipEqualKey; - ASSERT(rep.data.size() == 2 || !more); - } - - int index = distance-1; - if (skipEqualKey && rep.data.size() && rep.data[0].key == sel.getKey() ) - ++index; - - if (index < rep.data.size()) { - *pOffset = 0; - return rep.data[ index ].key; - } else { - // FIXME: If range.begin=="" && !forward, return success? - *pOffset = index - rep.data.size() + 1; - if (!forward) *pOffset = -*pOffset; - - if (more) { - TEST(true); // Key selector read range had more results - - ASSERT(rep.data.size()); - Key returnKey = forward ? keyAfter(rep.data.back().key) : rep.data.back().key; - - //This is possible if key/value pairs are very large and only one result is returned on a last less than query - //SOMEDAY: graceful handling of exceptionally sized values - ASSERT(returnKey != sel.getKey()); - - return returnKey; - } else - return forward ? 
-			return forward ? range.end : range.begin;
-	}
-}
-
-KeyRange getCachedKeyRange( StorageCacheData* data, const KeySelectorRef& sel )
-// Returns largest range that is cached on this server and selectorInRange(sel, range) or wrong_shard_server if no such range exists
-{
-	auto i = sel.isBackward() ? data->cachedRangeMap.rangeContainingKeyBefore( sel.getKey() ) :
-		data->cachedRangeMap.rangeContaining( sel.getKey() );
-	if (!i->value()) throw wrong_shard_server();
-	ASSERT( selectorInRange(sel, i->range()) );
-	return i->range();
-}
-
-ACTOR Future<Void> getKeyValues( StorageCacheData* data, GetKeyValuesRequest req )
-// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large selector offset prevents
-// all data from being read in one range read
-{
-	state int64_t resultSize = 0;
-
-	++data->counters.getRangeQueries;
-	++data->counters.allQueries;
-	//++data->readQueueSizeMetric;
-	//data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
-
-	// Active load balancing runs at a very high priority (to obtain accurate queue lengths)
-	// so we need to downgrade here
-	TaskPriority taskType = TaskPriority::DefaultEndpoint;
-	if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) {
-		taskType = TaskPriority::FetchKeys;
-	// } else if (false) {
-	//	// Placeholder for up-prioritizing fetches for important requests
-	//	taskType = TaskPriority::DefaultDelay;
-	}
-	wait( delay(0, taskType) );
-
-	try {
-		if( req.debugID.present() )
-			g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Before");
-		state Version version = wait( waitForVersion( data, req.version ) );
-
-		state KeyRange cachedKeyRange;
-		try {
-			cachedKeyRange = getCachedKeyRange(data, req.begin);
-
-			if (req.debugID.present())
-				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(),
-				                      "storagecache.getKeyValues.AfterVersion");
-			//.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end);
-		} catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", "None").detail("In", "getKeyValues>getShardKeyRange"); throw e; }
-
-		if ( !selectorInRange(req.end, cachedKeyRange) && !(req.end.isFirstGreaterOrEqual() && req.end.getKey() == cachedKeyRange.end) ) {
-//			TraceEvent("WrongShardServer1", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkShardExtents");
-			throw wrong_shard_server();
-		}
-
-		state int offset1;
-		state int offset2;
-		state Key begin = req.begin.isFirstGreaterOrEqual() ? req.begin.getKey() : findKey( data, req.begin, version, cachedKeyRange, &offset1 );
-		state Key end = req.end.isFirstGreaterOrEqual() ? req.end.getKey() : findKey( data, req.end, version, cachedKeyRange, &offset2 );
-		if( req.debugID.present() )
-			g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterKeys");
-		//.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey());
-
-		// Offsets of zero indicate begin/end keys in this cachedKeyRange, which obviously means we can answer the query
-		// An end offset of 1 is also OK because the end key is exclusive, so if the first key of the next cachedKeyRange is the end the last actual key returned must be from this cachedKeyRange.
-		// A begin offset of 1 is also OK because then either begin is past end or equal to end (so the result is definitely empty)
-		if ((offset1 && offset1!=1) || (offset2 && offset2!=1)) {
-			TEST(true); // wrong_cache_server due to offset
-			// We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end, and return a clipped range rather
-			// than an error (since that is what the NativeAPI.getRange will do anyway via its "slow path"), but we would have to add some flags to the response
-			// to encode whether we went off the beginning and the end, since it needs that information.
-			//TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2);
-			throw wrong_shard_server();
-		}
-
-		if (begin >= end) {
-			if( req.debugID.present() )
-				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Send");
-			//.detail("Begin",begin).detail("End",end);
-
-			GetKeyValuesReply none;
-			none.version = version;
-			none.more = false;
-			req.reply.send( none );
-		} else {
-			state int remainingLimitBytes = req.limitBytes;
-
-			GetKeyValuesReply _r = readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes);
-			GetKeyValuesReply r = _r;
-
-			if( req.debugID.present() )
-				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterReadRange");
-			//.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size());
-			if (EXPENSIVE_VALIDATION) {
-				for (int i = 0; i < r.data.size(); i++)
-					ASSERT(r.data[i].key >= begin && r.data[i].key < end);
-				ASSERT(r.data.size() <= std::abs(req.limit));
-			}
-
-			req.reply.send( r );
-
-			resultSize = req.limitBytes - remainingLimitBytes;
-			data->counters.bytesQueried += resultSize;
-			data->counters.rowsQueried += r.data.size();
-		}
-	} catch (Error& e) {
-		if(!canReplyWith(e))
-			throw;
-		req.reply.sendError(e);
-	}
-
-	++data->counters.finishedQueries;
-
-	return Void();
-}
-
-ACTOR Future<Void> getKey( StorageCacheData* data, GetKeyRequest req ) {
-	state int64_t resultSize = 0;
-
-	++data->counters.getKeyQueries;
-	++data->counters.allQueries;
-
-	// Active load balancing runs at a very high priority (to obtain accurate queue lengths)
-	// so we need to downgrade here
-	wait( delay(0, TaskPriority::DefaultEndpoint) );
-
-	try {
-		state Version version = wait( waitForVersion( data, req.version ) );
-		state KeyRange cachedKeyRange = getCachedKeyRange( data, req.sel );
-
-		state int offset;
-		Key k = findKey( data, req.sel, version, cachedKeyRange, &offset );
-
-		KeySelector updated;
-		if (offset < 0)
-			updated = firstGreaterOrEqual(k)+offset;	// first thing on this shard OR (large offset case) smallest key retrieved in range read
-		else if (offset > 0)
-			updated = firstGreaterOrEqual(k)+offset-1;	// first thing on next shard OR (large offset case) keyAfter largest key retrieved in range read
-		else
-			updated = KeySelectorRef(k,true,0); //found
-
-		resultSize = k.size();
-		data->counters.bytesQueried += resultSize;
-		++data->counters.rowsQueried;
-
-		GetKeyReply reply(updated);
-		req.reply.send(reply);
-	}
-	catch (Error& e) {
-		if (e.code() == error_code_wrong_shard_server) TraceEvent("WrongShardServer").detail("In","getKey");
-		if(!canReplyWith(e))
-			throw;
-		req.reply.sendError(e);
-	}
-
-	++data->counters.finishedQueries;
-
-	return Void();
-}
-
-#pragma endregion
-
-bool expandMutation( MutationRef& m, StorageCacheData::VersionedData const& data, KeyRef eagerTrustedEnd, Arena& ar ) {
-	// After this function call, m should be copied into an arena immediately (before modifying data, shards, or eager)
-	if (m.type == MutationRef::ClearRange) {
-		// Expand the clear
-		const auto& d = data.atLatest();
-
-		// If another clear overlaps the beginning of this one, engulf it
-		auto i = d.lastLess(m.param1);
-		if (i && i->isClearTo() && i->getEndKey() >= m.param1)
-			m.param1 = i.key();
-
-		// If another clear overlaps the end of this one, engulf it; otherwise expand
-		i = d.lastLessOrEqual(m.param2);
-		if (i && i->isClearTo() && i->getEndKey() >= m.param2) {
-			m.param2 = i->getEndKey();
-		} else {
-			// Expand to the next set or clear (from storage or latestVersion), and if it
-			// is a clear, engulf it as well
-			i = d.lower_bound(m.param2);
-			//KeyRef endKeyAtStorageVersion = m.param2 == eagerTrustedEnd ? eagerTrustedEnd : std::min( eager->getKeyEnd( m.param2 ), eagerTrustedEnd );
-			// TODO check if the following is correct
-			KeyRef endKeyAtStorageVersion = eagerTrustedEnd;
-			if (!i || endKeyAtStorageVersion < i.key())
-				m.param2 = endKeyAtStorageVersion;
-			else if (i->isClearTo())
-				m.param2 = i->getEndKey();
-			else
-				m.param2 = i.key();
-		}
-	}
-	else if (m.type != MutationRef::SetValue && (m.type)) {
-
-		Optional<StringRef> oldVal;
-		auto it = data.atLatest().lastLessOrEqual(m.param1);
-		if (it != data.atLatest().end() && it->isValue() && it.key() == m.param1)
-			oldVal = it->getValue();
-		else if (it != data.atLatest().end() && it->isClearTo() && it->getEndKey() > m.param1) {
-			TEST(true); // Atomic op right after a clear.
-		}
-
-		switch(m.type) {
-		case MutationRef::AddValue:
-			m.param2 = doLittleEndianAdd(oldVal, m.param2, ar);
-			break;
-		case MutationRef::And:
-			m.param2 = doAnd(oldVal, m.param2, ar);
-			break;
-		case MutationRef::Or:
-			m.param2 = doOr(oldVal, m.param2, ar);
-			break;
-		case MutationRef::Xor:
-			m.param2 = doXor(oldVal, m.param2, ar);
-			break;
-		case MutationRef::AppendIfFits:
-			m.param2 = doAppendIfFits(oldVal, m.param2, ar);
-			break;
-		case MutationRef::Max:
-			m.param2 = doMax(oldVal, m.param2, ar);
-			break;
-		case MutationRef::Min:
-			m.param2 = doMin(oldVal, m.param2, ar);
-			break;
-		case MutationRef::ByteMin:
-			m.param2 = doByteMin(oldVal, m.param2, ar);
-			break;
-		case MutationRef::ByteMax:
-			m.param2 = doByteMax(oldVal, m.param2, ar);
-			break;
-		case MutationRef::MinV2:
-			m.param2 = doMinV2(oldVal, m.param2, ar);
-			break;
-		case MutationRef::AndV2:
-			m.param2 = doAndV2(oldVal, m.param2, ar);
-			break;
-		case MutationRef::CompareAndClear:
-			if (oldVal.present() && m.param2 == oldVal.get()) {
-				m.type = MutationRef::ClearRange;
-				m.param2 = keyAfter(m.param1, ar);
-				return expandMutation(m, data, eagerTrustedEnd, ar);
-			}
-			return false;
-		}
-		m.type = MutationRef::SetValue;
-	}
-
-	return true;
-}
-
-// Applies a write mutation (SetValue or ClearRange) to the in-memory versioned data structure
-void applyMutation( StorageCacheData *self, MutationRef const& m, Arena& arena, StorageCacheData::VersionedData &data ) {
-	// m is expected to be in arena already
-	// Clear split keys are added to arena
-
-	if (m.type == MutationRef::SetValue) {
-		auto prev = data.atLatest().lastLessOrEqual(m.param1);
-		if (prev && prev->isClearTo() && prev->getEndKey() > m.param1) {
-			ASSERT( prev.key() <= m.param1 );
-			KeyRef end = prev->getEndKey();
-			// TODO double check if the insert version of the previous clear needs to be preserved for the "left half",
-			// insert() invalidates prev, so prev.key() is not safe to pass to it by reference
-			data.insert( KeyRef(prev.key()), ValueOrClearToRef::clearTo( m.param1 ), prev.insertVersion() );	// overwritten by below insert if empty
-			KeyRef nextKey = keyAfter(m.param1, arena);
-			if ( end != nextKey ) {
-				ASSERT( end > nextKey );
-				// TODO double check if it's okay to let go of the insert version of the "right half"
-				// FIXME: This copy is technically an asymptotic problem, definitely a waste of memory (copy of keyAfter is a waste, but not asymptotic)
-				data.insert( nextKey, ValueOrClearToRef::clearTo( KeyRef(arena, end) ) );
-			}
-		}
-		data.insert( m.param1, ValueOrClearToRef::value(m.param2) );
-	} else if (m.type == MutationRef::ClearRange) {
-		data.erase( m.param1, m.param2 );
-		ASSERT( m.param2 > m.param1 );
-		ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) );
-		data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) );
-	}
-}
-
-template <class T>
-void splitMutation(StorageCacheData* data, KeyRangeMap<T>& map, MutationRef const& m, Version ver) {
-	if(isSingleKeyMutation((MutationRef::Type) m.type)) {
-		auto i = map.rangeContaining(m.param1);
-		if (i->value()) // If this key lies in the cached key-range on this server
-			data->addMutation( i->range(), ver, m );
-	}
-	else if (m.type == MutationRef::ClearRange) {
-		KeyRangeRef mKeys( m.param1, m.param2 );
-		auto r = map.intersectingRanges( mKeys );
-		for(auto i = r.begin(); i != r.end(); ++i) {
-			if (i->value()) { // if this sub-range exists on this cache server
-				KeyRangeRef k = mKeys & i->range();
-				data->addMutation( i->range(), ver, MutationRef((MutationRef::Type)m.type, k.begin, k.end) );
-			}
-		}
-	} else
-		ASSERT(false);	// Unknown mutation type in splitMutations
-}
-
-void StorageCacheData::addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation) {
-	MutationRef expanded = mutation;
-	auto& mLog = addVersionToMutationLog(version);
-
-	if ( !expandMutation( expanded, data(), cachedKeyRange.end, mLog.arena()) ) {
-		return;
-	}
-	expanded = addMutationToMutationLog(mLog, expanded);
-	if (debugMutation("expandedMutation", version, expanded)) {
-		const char* type =
-			mutation.type == MutationRef::SetValue ? "SetValue" :
-			mutation.type == MutationRef::ClearRange ? "ClearRange" :
-			mutation.type == MutationRef::DebugKeyRange ? "DebugKeyRange" :
-			mutation.type == MutationRef::DebugKey ? "DebugKey" :
-			"UnknownMutation";
-		printf("DEBUGMUTATION:\t%.6f\t%s\t%s\t%s\t%s\t%s\n",
-			now(), g_network->getLocalAddress().toString().c_str(), "originalMutation",
-			type, printable(mutation.param1).c_str(), printable(mutation.param2).c_str());
-		printf("  Cached Key-range: %s - %s\n", printable(cachedKeyRange.begin).c_str(), printable(cachedKeyRange.end).c_str());
-	}
-	applyMutation( this, expanded, mLog.arena(), mutableData() );
-	printf("\nSCUpdate: Printing versioned tree after applying mutation\n");
-	mutableData().printTree(version);
-
-}
-
-// Helper class for updating the storage cache (i.e. applying mutations)
-class StorageCacheUpdater {
-public:
-	StorageCacheUpdater() : currentVersion(invalidVersion), processedCacheStartKey(false) {}
-	StorageCacheUpdater(Version currentVersion) : currentVersion(currentVersion), processedCacheStartKey(false) {}
-
-	void applyMutation(StorageCacheData* data, MutationRef const& m , Version ver) {
-		//TraceEvent("SCNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver);
-
-		if(currentVersion != ver) {
-			currentVersion = ver;
-			data->mutableData().createNewVersion(ver);
-		}
-
-		if (m.param1.startsWith( systemKeys.end )) {
-			//TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver);
-			applyPrivateCacheData( data, m );
-		} else {
-			// FIXME: enable when debugMutation is active
-			//for(auto m = changes[c].mutations.begin(); m; ++m) {
-			//	debugMutation("SCUpdateMutation", changes[c].version, *m);
-			//}
-
-			splitMutation(data, data->cachedRangeMap, m, ver);
-		}
-
-		//TODO
-		if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
-	}
-
-	Version currentVersion;
-private:
-	KeyRef cacheStartKey;
-	bool nowAssigned;
-	bool processedCacheStartKey;
-
-	// Applies private mutations. As the name suggests, it basically establishes the key-ranges
-	// that this cache server is responsible for.
-	// TODO Revisit during failure handling. Might we lose some private mutations?
-	void applyPrivateCacheData( StorageCacheData* data, MutationRef const& m ) {
-		TraceEvent(SevDebug, "SCPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());
-
-		if (processedCacheStartKey) {
-			// We expect changes in pairs, [begin,end). This mutation is for the end key of the range.
-			ASSERT (m.type == MutationRef::SetValue && m.param1.startsWith(data->ck));
-			KeyRangeRef keys( cacheStartKey.removePrefix(data->ck), m.param1.removePrefix(data->ck));
-			data->cachedRangeMap.insert(keys, true);
-			fprintf(stderr, "SCPrivateCacheMutation: begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str());
-
-			processedCacheStartKey = false;
-		} else if (m.type == MutationRef::SetValue && m.param1.startsWith( data->ck )) {
-			// We expect changes in pairs, [begin,end). This mutation is for the start key of the range.
-			cacheStartKey = m.param1;
-			processedCacheStartKey = true;
-		} else {
-			fprintf(stderr, "SCPrivateCacheMutation: Unknown private mutation\n");
-			ASSERT(false);	// Unknown private mutation
-		}
-	}
-};
-
-// Compacts the in-memory VersionedMap, i.e. removes versions below the desiredOldestVersion
-// TODO revisit if we change the data structure
-ACTOR Future<Void> compactCache(StorageCacheData* data) {
-	loop {
-		//TODO understand this, should we add delay here?
-		//if (g_network->isSimulated()) {
-		//	double endTime = g_simulator.checkDisabled(format("%s/compactCache", data->thisServerID.toString().c_str()));
-		//	if(endTime > now()) {
-		//		wait(delay(endTime - now(), TaskPriority::CompactCache));
-		//	}
-		//}
-
-		// Wait until the desiredOldestVersion is greater than the current oldestVersion
-		wait( data->desiredOldestVersion.whenAtLeast( data->oldestVersion.get()+1 ) );
-		wait( delay(0, TaskPriority::CompactCache) );
-
-		//TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use
-		state Promise<Void> compactionInProgress;
-		data->compactionInProgress = compactionInProgress.getFuture();
-		state Version desiredVersion = data->desiredOldestVersion.get();
-		// Call the compaction routine that does the actual work.
-		// TODO It's a synchronous function call as of now. Should it be async?
-		data->mutableData().compact(desiredVersion);
-		Future<Void> finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( desiredVersion,
-		                                                                                 TaskPriority::CompactCache );
-		data->oldestVersion.set( desiredVersion );
-		wait( finishedForgetting );
-		// TODO how do we yield here? This may not be enough, because compact() does the heavy lifting
-		// of compacting the VersionedMap. We should probably look into per-version compaction and then
-		// we can yield after compacting one version
-		wait( yield(TaskPriority::CompactCache) );
-
-		// TODO what flowlock to acquire during compaction?
-		compactionInProgress.send(Void());
-		wait( delay(0, TaskPriority::CompactCache) ); //Setting compactionInProgress could cause the cache server to shut down, so delay to check for cancellation
-	}
-}
-
-ACTOR Future<Void> pullAsyncData( StorageCacheData *data ) {
-	state Future<Void> dbInfoChange = Void();
-	state Reference<ILogSystem::IPeekCursor> r;
-	state Version tagAt = 0;
-
-	state StorageCacheUpdater updater(data->lastVersionWithData);
-	state Version ver = invalidVersion;
-	//data->lastTLogVersion = r->getMaxKnownVersion();
-	//data->versionLag = std::max<int64_t>(0, data->lastTLogVersion - data->version.get());
-	++data->counters.updateBatches;
-
-	loop {
-		loop {
-			choose {
-				when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) {
-					break;
-				}
-				when( wait( dbInfoChange ) ) {
-					if( data->logSystem->get() )
-						r = data->logSystem->get()->peek( data->thisServerID, tagAt, Optional<UID>(), cacheTag, true );
-					else
-						r = Reference<ILogSystem::IPeekCursor>();
-					dbInfoChange = data->logSystem->onChange();
-				}
-			}
-		}
-		//FIXME: if the popped version is greater than our last version, we need to clear the cache
-
-		//FIXME: ensure this can only read data from the current version
-		r->setProtocolVersion(currentProtocolVersion);
-
-		// Now process the mutations
-		for (; r->hasMessage(); r->nextMessage()) {
-			ArenaReader& reader = *r->reader();
-
-			MutationRef msg;
-			reader >> msg;
-			fprintf(stderr, "%lld : %s\n", r->version().version, msg.toString().c_str());
-
-			if (r->version().version > ver && r->version().version > data->version.get()) {
-				++data->counters.updateVersions;
-				ver = r->version().version;
-			}
-			if (ver != invalidVersion) // This change belongs to a version < minVersion
-			{
-				updater.applyMutation(data, msg, ver);
-				// TODO
-				//mutationBytes += msg.totalSize();
-				data->counters.mutationBytes += msg.totalSize();
-				++data->counters.mutations;
-				switch(msg.type) {
-					case MutationRef::SetValue:
-						++data->counters.setMutations;
-						break;
-					case MutationRef::ClearRange:
-						++data->counters.clearRangeMutations;
-						break;
-					case MutationRef::AddValue:
-					case MutationRef::And:
-					case MutationRef::AndV2:
-					case MutationRef::AppendIfFits:
-					case MutationRef::ByteMax:
-					case MutationRef::ByteMin:
-					case MutationRef::Max:
-					case MutationRef::Min:
-					case MutationRef::MinV2:
-					case MutationRef::Or:
-					case MutationRef::Xor:
-					case MutationRef::CompareAndClear:
-						++data->counters.atomicMutations;
-						break;
-				}
-			}
-			else
-				TraceEvent(SevError, "DiscardingPeekedData", data->thisServerID).detail("Mutation", msg.toString()).detail("Version", r->version().toString());
-
-			tagAt = r->version().version + 1;
-		}
-
-		if(ver != invalidVersion) {
-			data->lastVersionWithData = ver;
-		} else {
-			// TODO double check
-			ver = r->version().version - 1;
-		}
-
-		if(ver != invalidVersion && ver > data->version.get()) {
-			debugKeyRange("SCUpdate", ver, allKeys);
-
-			data->mutableData().createNewVersion(ver);
-
-			// TODO what about otherError
-			if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
-
-			// TODO may enable these later
-			//data->noRecentUpdates.set(false);
-			//data->lastUpdate = now();
-			data->version.set( ver );		// Triggers replies to waiting gets for new version(s)
-			// TODO double check
-			//setDataVersion(data->thisServerID, data->version.get());
-
-			// TODO what about otherError
-			if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
-
-			// We can get rid of versions beyond maxVersionsInMemory at any point. Update the
-			// desiredOldestVersion and that may invoke the compaction actor
-			Version maxVersionsInMemory = SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS;
-			Version proposedOldestVersion = data->version.get() - maxVersionsInMemory;
-			proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get());
-			data->desiredOldestVersion.set(proposedOldestVersion);
-		}
-
-		// TODO implement a validate function for the cache
-		//validate(data);
-
-		if(r->version().version >= data->lastTLogVersion) {
-			if(data->behind) {
-				TraceEvent("StorageCacheNoLongerBehind", data->thisServerID).detail("CursorVersion", r->version().version).detail("TLogVersion", data->lastTLogVersion);
-			}
-			data->behind = false;
-		}
-
-		tagAt = std::max( tagAt, r->version().version);
-	}
-}
-
-ACTOR Future<Void> storageCache(StorageServerInterface ssi, uint16_t id, Reference<AsyncVar<ServerDBInfo>> db) {
-	state StorageCacheData self(ssi.id(), id);
-	state ActorCollection actors(false);
-	state Future<Void> dbInfoChange = Void();
-
-	// This helps identify the private mutations meant for this cache server
-	self.ck = cacheKeysPrefixFor( id ).withPrefix(systemKeys.begin);  // FFFF/02cacheKeys/[this server]/
-
-	actors.add(waitFailureServer(ssi.waitFailure.getFuture()));
-
-	// compactCache actor will periodically compact the cache when a certain version condition is met
-	actors.add(compactCache(&self));
-
-	// pullAsyncData actor pulls mutations from the TLog and also applies them.
-	actors.add(pullAsyncData(&self));
-
-	loop {
-		++self.counters.loops;
-		choose {
-			when( wait( dbInfoChange ) ) {
-				dbInfoChange = db->onChange();
-				self.logSystem->set(ILogSystem::fromServerDBInfo( ssi.id(), db->get(), true ));
-			}
-			when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) {
-				// TODO do we need to add throttling for cache servers? Probably not
-				//actors.add(self->readGuard(req , getValueQ));
-				actors.add(getValueQ(&self, req));
-			}
-			when( WatchValueRequest req = waitNext(ssi.watchValue.getFuture()) ) {
-				ASSERT(false);
-			}
-			when (GetKeyRequest req = waitNext(ssi.getKey.getFuture())) {
-				actors.add(getKey(&self, req));
-			}
-			when (GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture()) ) {
-				actors.add(getKeyValues(&self, req));
-			}
-			when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) {
-				ASSERT(false);
-			}
-			when (StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
-				ASSERT(false);
-			}
-			//when( ReplyPromise<Version> reply = waitNext(ssi.getVersion.getFuture()) ) {
-			//	ASSERT(false);
-			//}
-			when( ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture()) ) {
-				ASSERT(false);
-			}
-			when(wait(actors.getResult())) {}
-		}
-	}
-}
diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp
index 7ce0bb5d79..e1a26a3042 100644
--- a/fdbserver/TLogServer.actor.cpp
+++ b/fdbserver/TLogServer.actor.cpp
@@ -21,6 +21,7 @@
 #include "flow/Hash3.h"
 #include "flow/Stats.h"
 #include "flow/UnitTest.h"
+#include "flow/FBTrace.h"
 #include "fdbclient/NativeAPI.actor.h"
 #include "fdbclient/Notified.h"
 #include "fdbclient/KeyRangeMap.h"
@@ -1766,6 +1767,9 @@ ACTOR Future<Void> tLogCommit(
 		tlogDebugID = nondeterministicRandom()->randomUniqueID();
 		g_traceBatch.addAttach("CommitAttachID", req.debugID.get().first(), tlogDebugID.get().first());
 		g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.BeforeWaitForVersion");
+		//FIXME
+		fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(req.debugID.get().first(), now(),
+		                                                         CommitDebugTrace::TLOG_TLOGCOMMIT_BEFOREWAITFORVERSION)));
 	}
 
 	logData->minKnownCommittedVersion = std::max(logData->minKnownCommittedVersion, req.minKnownCommittedVersion);
@@ -1795,8 +1799,12 @@ ACTOR Future<Void> tLogCommit(
 	}
 
 	if (logData->version.get() == req.prevVersion) {  // Not a duplicate (check relies on critical section between here self->version.set() below!)
-		if(req.debugID.present())
+		if(req.debugID.present()) {
 			g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.Before");
+			//FIXME
+			fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(req.debugID.get().first(), now(),
+			                                                         CommitDebugTrace::TLOG_TLOGCOMMIT_BEFORE)));
+		}
 
 		//TraceEvent("TLogCommit", logData->logId).detail("Version", req.version);
 		commitMessages(self, logData, req.version, req.arena, req.messages);
@@ -1819,8 +1827,12 @@ ACTOR Future<Void> tLogCommit(
 		// Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors
 		logData->version.set( req.version );
 
-		if(req.debugID.present())
+		if(req.debugID.present()) {
 			g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.AfterTLogCommit");
+			//FIXME
+			fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(req.debugID.get().first(), now(),
+			                                                         CommitDebugTrace::TLOG_TLOGCOMMIT_AFTERTLOGCOMMIT)));
+		}
 	}
 	// Send replies only once all prior messages have been received and committed.
 	state Future<Void> stopped = logData->stopCommit.onTrigger();
@@ -1832,8 +1844,12 @@ ACTOR Future<Void> tLogCommit(
 		return Void();
 	}
 
-	if(req.debugID.present())
+	if(req.debugID.present()) {
 		g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.After");
+		//FIXME
+		fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(req.debugID.get().first(), now(),
+		                                                         CommitDebugTrace::TLOG_TLOGCOMMIT_AFTER)));
+	}
 
 	req.reply.send( logData->durableKnownCommittedVersion );
 	return Void();
@@ -2099,6 +2115,9 @@ ACTOR Future<Void> serveTLogInterface( TLogData* self, TLogInterface tli, Refere
 				UID tlogDebugID = nondeterministicRandom()->randomUniqueID();
 				g_traceBatch.addAttach("TransactionAttachID", req.debugID.get().first(), tlogDebugID.first());
 				g_traceBatch.addEvent("TransactionDebug", tlogDebugID.first(), "TLogServer.TLogConfirmRunningRequest");
+				//FIXME
+				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(req.debugID.get().first(), now(),
+				                                                                   TransactionDebugTrace::TLOGSERVER_TLOGCONFIRMRUNNINGREQUEST)));
 			}
 			if (!logData->stopped)
 				req.reply.send(Void());
diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp
index e8a9bbca76..59d337c449 100644
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -51,6 +51,7 @@
 #include "fdbrpc/sim_validation.h"
 #include "fdbrpc/Smoother.h"
 #include "flow/Stats.h"
+#include "flow/FBTrace.h"
 #include "flow/TDMetric.actor.h"
 #include <type_traits>
 #include "flow/actorcompiler.h"   // This must be the last #include.
@@ -838,13 +839,21 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
 	// so we need to downgrade here
 	wait( delay(0, TaskPriority::DefaultEndpoint) );
 
-	if( req.debugID.present() )
+	if( req.debugID.present() ) {
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
+		//FIXME
+		fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(req.debugID.get().first(), now(),
+		                                                             GetValueDebugTrace::STORAGESERVER_GETVALUE_DOREAD)));
+	}
 
 	state Optional<Value> v;
 	state Version version = wait( waitForVersion( data, req.version ) );
-	if( req.debugID.present() )
+	if( req.debugID.present() ) {
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask());
+		//FIXME
+		fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(req.debugID.get().first(), now(),
+		                                                             GetValueDebugTrace::STORAGESERVER_GETVALUE_AFTERVERSION)));
+	}
 
 	state uint64_t changeCounter = data->shardChangeCounter;
 
@@ -897,8 +906,12 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
 		data->metrics.notifyBytesReadPerKSecond(req.key, bytesReadPerKSecond);
 	}
 
-	if( req.debugID.present() )
+	if( req.debugID.present() ) {
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());
+		//FIXME
+		fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(req.debugID.get().first(), now(),
+		                                                             GetValueDebugTrace::STORAGESERVER_GETVALUE_AFTERREAD)));
+	}
 
 	GetValueReply reply(v);
 	reply.penalty = data->getPenalty();
@@ -923,12 +936,20 @@ ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req )
 	try {
 		++data->counters.watchQueries;
 
-		if( req.debugID.present() )
+		if( req.debugID.present() ) {
 			g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.Before"); //.detail("TaskID", g_network->getCurrentTask());
+			//FIXME
+			fbTrace(Reference<WatchValueDebugTrace>(new WatchValueDebugTrace(req.debugID.get().first(), now(),
+			                                                                 WatchValueDebugTrace::STORAGESERVER_WATCHVALUE_BEFORE)));
+		}
 
 		wait(success(waitForVersionNoTooOld(data, req.version)));
-		if( req.debugID.present() )
+		if( req.debugID.present() ) {
 			g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask());
+			//FIXME
+			fbTrace(Reference<WatchValueDebugTrace>(new WatchValueDebugTrace(req.debugID.get().first(), now(),
+			                                                                 WatchValueDebugTrace::STORAGESERVER_WATCHVALUE_AFTERVERSION)));
+		}
 
 		loop {
 			try {
@@ -945,8 +966,12 @@ ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req )
 				debugMutation("ShardWatchValue", latest, MutationRef(MutationRef::DebugKey, req.key, reply.value.present() ? StringRef( reply.value.get() ) : LiteralStringRef("<null>") ) );
 
-				if( req.debugID.present() )
+				if( req.debugID.present() ) {
 					g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());
+					//FIXME
+					fbTrace(Reference<WatchValueDebugTrace>(new WatchValueDebugTrace(req.debugID.get().first(), now(),
+					                                                                 WatchValueDebugTrace::STORAGESERVER_WATCHVALUE_AFTERREAD)));
+				}
 
 				if( reply.value != req.value ) {
 					req.reply.send(WatchValueReply{ latest });
@@ -1351,16 +1376,24 @@ ACTOR Future<Void> getKeyValues( StorageServer* data, GetKeyValuesRequest req )
 	wait( delay(0, taskType) );
 
 	try {
-		if( req.debugID.present() )
+		if( req.debugID.present() ) {
 			g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before");
+			//FIXME
+			fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(req.debugID.get().first(), now(),
+			                                                                   TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_BEFORE)));
+		}
 		state Version version = wait( waitForVersion( data, req.version ) );
 
 		state uint64_t changeCounter = data->shardChangeCounter;
		// try {
 		state KeyRange shard = getShardKeyRange( data, req.begin );
 
-		if( req.debugID.present() )
+		if( req.debugID.present() ) {
 			g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterVersion");
+			//FIXME
+			fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(req.debugID.get().first(), now(),
+			                                                                   TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_AFTERVERSION)));
+		}
 		//.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end);
 		//} catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", "None").detail("In", "getKeyValues>getShardKeyRange"); throw e; }
 
@@ -1375,8 +1408,12 @@ ACTOR Future<Void> getKeyValues( StorageServer* data, GetKeyValuesRequest req )
 		state Future<Key> fEnd = req.end.isFirstGreaterOrEqual() ? Future<Key>(req.end.getKey()) : findKey( data, req.end, version, shard, &offset2 );
 		state Key begin = wait(fBegin);
 		state Key end = wait(fEnd);
-		if( req.debugID.present() )
+		if( req.debugID.present() ) {
 			g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterKeys");
+			//FIXME
+			fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(req.debugID.get().first(), now(),
+			                                                                   TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_AFTERKEYS)));
+		}
 		//.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey());
 
 		// Offsets of zero indicate begin/end keys in this shard, which obviously means we can answer the query
@@ -1392,8 +1429,12 @@ ACTOR Future<Void> getKeyValues( StorageServer* data, GetKeyValuesRequest req )
 		}
 
 		if (begin >= end) {
-			if( req.debugID.present() )
+			if( req.debugID.present() ) {
 				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Send");
+				//FIXME
+				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(req.debugID.get().first(), now(),
+				                                                                   TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_SEND)));
+			}
 			//.detail("Begin",begin).detail("End",end);
 
 			GetKeyValuesReply none;
@@ -1409,8 +1450,12 @@ ACTOR Future<Void> getKeyValues( StorageServer* data, GetKeyValuesRequest req )
 			GetKeyValuesReply _r = wait( readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes) );
 			GetKeyValuesReply r = _r;
 
-			if( req.debugID.present() )
+			if( req.debugID.present() ) {
 				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterReadRange");
+				//FIXME
+				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(req.debugID.get().first(), now(),
+				                                                                   TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_AFTERREADRANGE)));
+			}
 			//.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size());
 			data->checkChangeCounter( changeCounter, KeyRangeRef( std::min(begin, std::min(req.begin.getKey(), req.end.getKey())), std::max(end, std::max(req.begin.getKey(), req.end.getKey())) ) );
 			if (EXPENSIVE_VALIDATION) {
@@ -3602,8 +3647,12 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterfac
 			}
 			when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) {
 				// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
-				if( req.debugID.present() )
+				if( req.debugID.present() ) {
 					g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.received"); //.detail("TaskID", g_network->getCurrentTask());
+					//FIXME
+					fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(req.debugID.get().first(), now(),
+					                                                             GetValueDebugTrace::STORAGESERVER_GETVALUE_RECEIVED)));
+				}
 				if (SHORT_CIRCUT_ACTUAL_STORAGE && normalKeys.contains(req.key))
 					req.reply.send(GetValueReply());
diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp
index cd7cc918c3..ce9623a079 100644
--- a/fdbserver/workloads/ReadWrite.actor.cpp
+++ b/fdbserver/workloads/ReadWrite.actor.cpp
@@ -31,6 +31,7 @@
 #include "fdbserver/ClusterRecruitmentInterface.h"
 #include "fdbclient/ReadYourWrites.h"
 #include "flow/TDMetric.actor.h"
+#include "flow/FBTrace.h"
 #include "flow/actorcompiler.h"  // This must be the last #include.
 
 const int sampleSize = 10000;
@@ -618,6 +619,9 @@ struct ReadWriteWorkload : KVWorkload {
 					debugID = deterministicRandom()->randomUniqueID();
 					tr.debugTransaction(debugID);
 					g_traceBatch.addEvent("TransactionDebug", debugID.first(), "ReadWrite.randomReadWriteClient.Before");
+					//FIXME
+					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(debugID.first(), now(),
+					                                                                   TransactionDebugTrace::READWRITE_RANDOMREADWRITECLIENT_BEFORE)));
 				}
 				else {
 					debugID = UID();
@@ -690,8 +694,12 @@ struct ReadWriteWorkload : KVWorkload {
 					}
 				}
 
-				if(debugID != UID())
+				if(debugID != UID()) {
 					g_traceBatch.addEvent("TransactionDebug", debugID.first(), "ReadWrite.randomReadWriteClient.After");
+					//FIXME
+					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(debugID.first(), now(),
+					                                                                   TransactionDebugTrace::READWRITE_RANDOMREADWRITECLIENT_AFTER)));
+				}
 
 				tr = Trans();
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index 3c76a108e0..13404e2d47 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -44,17 +44,20 @@ public:
 	constexpr static FileIdentifier file_identifier = 617894;
 	enum codeLocation {
 		STORAGESERVER_GETVALUE_RECEIVED = 0,
-		STORAGESERVER_GETVALUE_DO_READ = 1,
-		STORAGESERVER_GETVALUE_AFTER_VERSION = 2,
-		STORAGESERVER_GETVALUE_AFTER_READ = 3,
+		STORAGESERVER_GETVALUE_DOREAD = 1,
+		STORAGESERVER_GETVALUE_AFTERVERSION = 2,
+		STORAGESERVER_GETVALUE_AFTERREAD = 3,
 		STORAGECACHE_GETVALUE_RECEIVED = 4,
-		STORAGECACHE_GETVALUE_DO_READ = 5,
-		STORAGECACHE_GETVALUE_AFTER_VERSION = 6,
-		STORAGECACHE_GETVALUE_AFTER_READ = 7,
+		STORAGECACHE_GETVALUE_DOREAD = 5,
+		STORAGECACHE_GETVALUE_AFTERVERSION = 6,
+		STORAGECACHE_GETVALUE_AFTERREAD = 7,
 		READER_GETVALUE_BEFORE = 8,
 		READER_GETVALUE_AFTER = 9,
 		READER_GETVALUEPREFIX_BEFORE = 10,
-		READER_GETVALUEPREFIX_AFTER = 11
+		READER_GETVALUEPREFIX_AFTER = 11,
+		NATIVEAPI_GETVALUE_BEFORE = 12,
+		NATIVEAPI_GETVALUE_AFTER = 13,
+		NATIVEAPI_GETVALUE_ERROR = 14
 	};
 
 	uint64_t id;
@@ -74,10 +77,10 @@ public:
 	constexpr static FileIdentifier file_identifier = 14486715;
 	enum codeLocation {
 		STORAGESERVER_WATCHVALUE_BEFORE = 1,
-		STORAGESERVER_WATCHVALUE_AFTER_VERSION = 2,
-		STORAGESERVER_WATCHVALUE_AFTER_READ = 3,
+		STORAGESERVER_WATCHVALUE_AFTERVERSION = 2,
+		STORAGESERVER_WATCHVALUE_AFTERREAD = 3,
 		NATIVEAPI_WATCHVALUE_BEFORE = 4,
-		NATIVEAPI_WATCHVALUE_AFTER_READ = 5
+		NATIVEAPI_WATCHVALUE_AFTER = 5
 	};
 
 	uint64_t id;
@@ -99,9 +102,9 @@ public:
 		STORAGESERVER_COMMIT_BEORE = 0,
 		STORAGESERVER_COMMIT_AFTER_VERSION = 1,
 		STORAGESERVER_COMMIT_AFTER_READ = 2,
-		NATIVEAPI_COMMIT_BEORE = 3,
+		NATIVEAPI_COMMIT_BEFORE = 3,
 		NATIVEAPI_COMMIT_AFTER = 4,
-		MASTERROXYSERVER_BATCHER = 5,
+		MASTERPROXYSERVER_BATCHER = 5,
 		MASTERPROXYSERVER_COMMITBATCH_BEFORE = 6,
 		MASTERPROXYSERVER_COMMITBATCH_GETTINGCOMMITVERSION = 7,
 		MASTERPROXYSERVER_COMMITBATCH_GOTCOMMITVERSION = 8,

From e22b88d857785bb7cfac25410dc30eab1d268bcc Mon Sep 17 00:00:00 2001
From: Markus Pilman
Date: Thu, 16 Apr 2020 13:14:27 -0700
Subject: [PATCH 09/27] Revert "Add FBTrace calls for most debugTransaction
 trace messages."

This reverts commit be68bd6ad239be0a6cc47ee42f85e929d89310a1.
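For reference, the instrumentation pattern that this commit backs out paired each
existing g_traceBatch event with a flatbuffers trace record. A minimal sketch of one
such call site follows (abbreviated for illustration only; see the
storageserver.actor.cpp hunks in the reverted commit for the exact form):

    // Inside an actor handling a request that carries an optional debug ID:
    if (req.debugID.present()) {
        // Existing trace-batch event, unchanged by the series.
        g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead");
        // New FBTrace record carrying (transaction id, timestamp, code location).
        fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(
            req.debugID.get().first(), now(), GetValueDebugTrace::STORAGESERVER_GETVALUE_DOREAD)));
    }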
---
 fdbclient/NativeAPI.actor.cpp           |  105 +--
 fdbserver/KeyValueStoreSQLite.actor.cpp |   29 +-
 fdbserver/MasterProxyServer.actor.cpp   |   67 +-
 fdbserver/Resolver.actor.cpp            |   19 +-
 fdbserver/StorageCache.actor.cpp        | 1010 +++++++++++++++++++++++
 fdbserver/TLogServer.actor.cpp          |   25 +-
 fdbserver/storageserver.actor.cpp       |   73 +-
 fdbserver/workloads/ReadWrite.actor.cpp |   10 +-
 flow/FBTrace.h                          |   27 +-
 9 files changed, 1067 insertions(+), 298 deletions(-)

diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp
index 6305e2234b..f7f71865f8 100644
--- a/fdbclient/NativeAPI.actor.cpp
+++ b/fdbclient/NativeAPI.actor.cpp
@@ -47,7 +47,6 @@
 #include "flow/SystemMonitor.h"
 #include "flow/TLSConfig.actor.h"
 #include "flow/UnitTest.h"
-#include "flow/FBTrace.h"
 
 #if defined(CMAKE_BUILD) || !defined(WIN32)
 #include "versions.h"
@@ -1198,12 +1197,8 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal(
 		ASSERT( key < allKeys.end );
 	}
 
-	if( info.debugID.present() ) {
+	if( info.debugID.present() )
 		g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.Before");
-		//FIXME
-		fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-		                                                                   TransactionDebugTrace::NATIVEAPI_GETKEYLOCATION_BEFORE)));
-	}
 
 	loop {
 		++cx->transactionKeyServerLocationRequests;
@@ -1211,12 +1206,8 @@ ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal(
 			when ( wait( cx->onMasterProxiesChanged() ) ) {}
 			when ( GetKeyServerLocationsReply rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional<KeyRef>(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
 				++cx->transactionKeyServerLocationRequestsCompleted;
-				if( info.debugID.present() ) {
+				if( info.debugID.present() )
 					g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After");
-					//FIXME
-					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-					                                                                   TransactionDebugTrace::NATIVEAPI_GETKEYLOCATION_AFTER)));
-				}
 
 				ASSERT( rep.results.size() == 1 );
 
 				auto locationInfo = cx->setCachedLocation(rep.results[0].first, rep.results[0].second);
@@ -1245,12 +1236,8 @@ Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation( Database const&
 }
 
 ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLocations_internal( Database cx, KeyRange keys, int limit, bool reverse, TransactionInfo info ) {
-	if( info.debugID.present() ) {
+	if( info.debugID.present() )
 		g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.Before");
-		//FIXME
-		fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-		                                                                   TransactionDebugTrace::NATIVEAPI_GETKEYLOCATIONS_BEFORE)));
-	}
 
 	loop {
 		++cx->transactionKeyServerLocationRequests;
@@ -1259,12 +1246,8 @@ ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLoca
 			when ( GetKeyServerLocationsReply _rep = wait( loadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
 				++cx->transactionKeyServerLocationRequestsCompleted;
 				state GetKeyServerLocationsReply rep = _rep;
-				if( info.debugID.present() ) {
+				if( info.debugID.present() )
 					g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.After");
-					//FIXME
-					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-					                                                                   TransactionDebugTrace::NATIVEAPI_GETKEYLOCATIONS_AFTER)));
-				}
 
 				ASSERT( rep.results.size() );
 
 				state vector< pair<KeyRange,Reference<LocationInfo>> > results;
@@ -1364,9 +1347,6 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
 			g_traceBatch.addAttach("GetValueAttachID", info.debugID.get().first(), getValueID.get().first());
 			g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask());
-			//FIXME
-			fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(getValueID.get().first(), now(),
-			                                                             GetValueDebugTrace::NATIVEAPI_GETVALUE_BEFORE)));
 			/*TraceEvent("TransactionDebugGetValueInfo", getValueID.get())
 				.detail("Key", key)
 				.detail("ReqVersion", ver)
@@ -1411,9 +1391,6 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
 			if( info.debugID.present() ) {
 				g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.After"); //.detail("TaskID", g_network->getCurrentTask());
-				//FIXME
-				fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(getValueID.get().first(), now(),
-				                                                             GetValueDebugTrace::NATIVEAPI_GETVALUE_AFTER)));
 				/*TraceEvent("TransactionDebugGetValueDone", getValueID.get())
 					.detail("Key", key)
 					.detail("ReqVersion", ver)
@@ -1428,9 +1405,6 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Databa
 			cx->getValueCompleted->log();
 			if( info.debugID.present() ) {
 				g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Error"); //.detail("TaskID", g_network->getCurrentTask());
-				//FIXME
-				fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(getValueID.get().first(), now(),
-				                                                             GetValueDebugTrace::NATIVEAPI_GETVALUE_ERROR)));
 				/*TraceEvent("TransactionDebugGetValueDone", getValueID.get())
 					.detail("Key", key)
 					.detail("ReqVersion", ver)
@@ -1452,12 +1426,8 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
 	wait(success(version));
 
-	if( info.debugID.present() ) {
+	if( info.debugID.present() )
 		g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.AfterVersion");
-		//FIXME
-		fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-		                                                                   TransactionDebugTrace::NATIVEAPI_GETKEY_AFTERVERSION)));
-	}
 
 	loop {
 		if (k.getKey() == allKeys.end) {
@@ -1472,12 +1442,8 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
 		state pair<KeyRange, Reference<LocationInfo>> ssi = wait( getKeyLocation(cx, locationKey, &StorageServerInterface::getKey, info, k.isBackward()) );
 
 		try {
-			if( info.debugID.present() ) {
+			if( info.debugID.present() )
 				g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual);
-				//FIXME
-				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-				                                                                   TransactionDebugTrace::NATIVEAPI_GETKEY_BEFORE)));
-			}
 			++cx->transactionPhysicalReads;
 			state GetKeyReply reply;
 			try {
@@ -1495,12 +1461,8 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, T
 				++cx->transactionPhysicalReadsCompleted;
 				throw;
 			}
-			if( info.debugID.present() ) {
+			if( info.debugID.present() )
 				g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", reply.sel.offset).detail("OrEqual", k.orEqual);
-				//FIXME
-				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-				                                                                   TransactionDebugTrace::NATIVEAPI_GETKEY_AFTER)));
-			}
 			k = reply.sel;
 			if (!k.offset && k.orEqual) {
 				return k.getKey();
 			}
@@ -1573,9 +1535,6 @@ ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value>
 			g_traceBatch.addAttach("WatchValueAttachID", info.debugID.get().first(), watchValueID.get().first());
 			g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.Before"); //.detail("TaskID", g_network->getCurrentTask());
-			//FIXME
-			fbTrace(Reference<WatchValueDebugTrace>(new WatchValueDebugTrace(watchValueID.get().first(), now(),
-			                                                                 WatchValueDebugTrace::NATIVEAPI_WATCHVALUE_BEFORE)));
 		}
 		state WatchValueReply resp;
 		choose {
@@ -1588,9 +1547,6 @@ ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value>
 		}
 		if( info.debugID.present() ) {
 			g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); //.detail("TaskID", g_network->getCurrentTask());
-			//FIXME
-			fbTrace(Reference<WatchValueDebugTrace>(new WatchValueDebugTrace(watchValueID.get().first(), now(),
-			                                                                 WatchValueDebugTrace::NATIVEAPI_WATCHVALUE_AFTER)));
 		}
 
 		//FIXME: wait for known committed version on the storage server before replying,
@@ -1671,9 +1627,6 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
 		try {
 			if( info.debugID.present() ) {
 				g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.Before");
-				//FIXME
-				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-				                                                                   TransactionDebugTrace::NATIVEAPI_GETEXACTRANGE_BEFORE)));
 				/*TraceEvent("TransactionDebugGetExactRangeInfo", info.debugID.get())
 					.detail("ReqBeginKey", req.begin.getKey())
 					.detail("ReqEndKey", req.end.getKey())
@@ -1700,12 +1653,8 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version ver
 				++cx->transactionPhysicalReadsCompleted;
 				throw;
 			}
-			if( info.debugID.present() ) {
+			if( info.debugID.present() )
 				g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getExactRange.After");
-				//FIXME
-				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-				                                                                   TransactionDebugTrace::NATIVEAPI_GETEXACTRANGE_AFTER)));
-			}
 			output.arena().dependsOn( rep.arena );
 			output.append( output.arena(), rep.data.begin(), rep.data.size() );
@@ -1964,9 +1913,6 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
 			if( info.debugID.present() ) {
 				g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getRange.Before");
-				//FIXME
-				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-				                                                                   TransactionDebugTrace::NATIVEAPI_GETRANGE_BEFORE)));
 				/*TraceEvent("TransactionDebugGetRangeInfo", info.debugID.get())
 					.detail("ReqBeginKey", req.begin.getKey())
 					.detail("ReqEndKey", req.end.getKey())
@@ -2002,9 +1948,6 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
 				if( info.debugID.present() ) {
 					g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getRange.After");
-					//FIXME
-					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-					                                                                   TransactionDebugTrace::NATIVEAPI_GETEXACTRANGE_AFTER)));
 					/*TraceEvent("TransactionDebugGetRangeDone", info.debugID.get())
 						.detail("ReqBeginKey", req.begin.getKey())
 						.detail("ReqEndKey", req.end.getKey())
@@ -2092,9 +2035,6 @@ ACTOR Future<Standalone<RangeResultRef>> getRange( Database cx, Reference<Transa
 			if (info.debugID.present()) {
-				//FIXME
-				fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(info.debugID.get().first(), now(),
-				                                                                   TransactionDebugTrace::NATIVEAPI_GETRANGE_ERROR)));
 				TraceEvent("TransactionDebugError", info.debugID.get()).error(e);
 			}
 			if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
@@ -2783,9 +2723,6 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
 			commitID = nondeterministicRandom()->randomUniqueID();
 			g_traceBatch.addAttach("CommitAttachID", info.debugID.get().first(), commitID.get().first());
 			g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.Before");
-			//FIXME
-			fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(commitID.get().first(), now(),
-			                                                         CommitDebugTrace::NATIVEAPI_COMMIT_BEFORE)));
 		}
 
 		req.debugID = commitID;
@@ -2829,12 +2766,9 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
 				cx->transactionCommittedMutations += req.transaction.mutations.size();
 				cx->transactionCommittedMutationBytes += req.transaction.mutations.expectedSize();
 
-				if(info.debugID.present()) {
+				if(info.debugID.present())
 					g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After");
-					//FIXME
-					fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(commitID.get().first(), now(),
-					                                                         CommitDebugTrace::NATIVEAPI_COMMIT_AFTER)));
-				}
+
 				double latency = now() - startTime;
 				cx->commitLatencies.addSample(latency);
 				cx->latencies.addSample(now() - tr->startTime);
@@ -2884,12 +2818,9 @@ ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo>
 				if (info.debugID.present())
 					TraceEvent(interval.end()).detail("Conflict", 1);
 
-				if(info.debugID.present()) {
+				if(info.debugID.present())
 					g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After");
-					//FIXME
-					fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(commitID.get().first(), now(),
-					                                                         CommitDebugTrace::NATIVEAPI_COMMIT_AFTER)));
-				}
+
 				throw not_committed();
 			}
 		}
@@ -3219,23 +3150,15 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<Stri
 ACTOR Future<GetReadVersionReply> getConsistentReadVersion( DatabaseContext *cx, uint32_t transactionCount, uint32_t flags, Optional<UID> debugID ) {
 	try {
 		++cx->transactionReadVersionBatches;
-		if( debugID.present() ) {
+		if( debugID.present() )
 			g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.Before");
-			//FIXME
-			fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(debugID.get().first(), now(),
-			                                                                   TransactionDebugTrace::NATIVEAPI_GETCONSISTENTREADVERSION_BEFORE)));
-		}
 		loop {
 			state GetReadVersionRequest req( transactionCount, flags, debugID );
 			choose {
 				when ( wait( cx->onMasterProxiesChanged() ) ) {}
 				when ( GetReadVersionReply v = wait( loadBalance( cx->getMasterProxies(flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES), &MasterProxyInterface::getConsistentReadVersion, req, cx->taskID ) ) ) {
-					if( debugID.present() ) {
+					if( debugID.present() )
 						g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.After");
-						//FIXME
-						fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(debugID.get().first(), now(),
-						                                                                   TransactionDebugTrace::NATIVEAPI_GETCONSISTENTREADVERSION_AFTER)));
-					}
 					ASSERT( v.version > 0 );
 					cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version);
 					return v;
diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp
index 09f0d66774..598bcc0f57 100644
--- a/fdbserver/KeyValueStoreSQLite.actor.cpp
+++ b/fdbserver/KeyValueStoreSQLite.actor.cpp
@@ -25,7 +25,6 @@
 #include "fdbserver/CoroFlow.h"
 #include "fdbserver/Knobs.h"
 #include "flow/Hash3.h"
-#include "flow/FBTrace.h"
 
 extern "C" {
 #include "fdbserver/sqlite/sqliteInt.h"
@@ -1531,22 +1530,12 @@ private:
 		};
 		void action( ReadValueAction& rv ) {
 			//double t = timer();
-			if (rv.debugID.present()) {
-				g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
-				//FIXME
-				fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(rv.debugID.get().first(), now(),
-				                                                             GetValueDebugTrace::READER_GETVALUE_BEFORE)));
-			}
+			if (rv.debugID.present()) g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
 
 			rv.result.send( getCursor()->get().get(rv.key) );
 			++counter;
-			if (rv.debugID.present()) {
-				g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask());
-				//FIXME
-				fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(rv.debugID.get().first(), now(),
-				                                                             GetValueDebugTrace::READER_GETVALUE_AFTER)));
-			}
+			if (rv.debugID.present()) g_traceBatch.addEvent("GetValueDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask());
 			//t = timer()-t;
 			//if (t >= 1.0) TraceEvent("ReadValueActionSlow",dbgid).detail("Elapsed", t);
 		}
@@ -1561,22 +1550,12 @@ private:
 		};
 		void action( ReadValuePrefixAction& rv ) {
 			//double t = timer();
-			if (rv.debugID.present()) {
-				g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
-				//FIXME
-				fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(rv.debugID.get().first(), now(),
-				                                                             GetValueDebugTrace::READER_GETVALUEPREFIX_BEFORE)));
-			}
+			if (rv.debugID.present()) g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
 
 			rv.result.send( getCursor()->get().getPrefix(rv.key, rv.maxLength) );
 
 			++counter;
-			if (rv.debugID.present()) {
-				g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask());
-				//FIXME
-				fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(rv.debugID.get().first(), now(),
-				                                                             GetValueDebugTrace::READER_GETVALUEPREFIX_AFTER)));
-			}
+			if (rv.debugID.present()) g_traceBatch.addEvent("GetValuePrefixDebug", rv.debugID.get().first(), "Reader.After"); //.detail("TaskID", g_network->getCurrentTask());
 			//t = timer()-t;
 			//if (t >= 1.0) TraceEvent("ReadValuePrefixActionSlow",dbgid).detail("Elapsed", t);
 		}
diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp
index ac12d8ff2d..af9823a1fe 100644
--- a/fdbserver/MasterProxyServer.actor.cpp
+++ b/fdbserver/MasterProxyServer.actor.cpp
@@ -45,7 +45,6 @@
 #include "flow/ActorCollection.h"
 #include "flow/Knobs.h"
 #include "flow/Stats.h"
-#include "flow/FBTrace.h"
 #include "flow/TDMetric.actor.h"
 #include "flow/actorcompiler.h"	// This must be the last #include.
@@ -247,12 +246,8 @@ ACTOR Future<Void> queueTransactionStartRequests(
 				req.reply.send(rep);
 				TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceeded").suppressFor(60);
 			} else {
-				if (req.debugID.present()) {
+				if (req.debugID.present())
 					g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "MasterProxyServer.queueTransactionStartRequests.Before");
-					//FIXME
-					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(req.debugID.get().first(), now(),
-					                                                                   TransactionDebugTrace::MASTERPROXYSERVER_QUEUETRANSACTIONSTARTREQUESTS_BEFORE)));
-				}
 
 				if (systemQueue->empty() && defaultQueue->empty() && batchQueue->empty()) {
 					forwardPromise(GRVTimer, delayJittered(std::max(0.0, *GRVBatchTime - (now() - *lastGRVTime)), TaskPriority::ProxyGRVTimer));
@@ -556,9 +551,6 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData *commitData, PromiseStream<std:
 					g_traceBatch.addEvent("CommitDebug", req.debugID.get().first(), "MasterProxyServer.batcher");
-					//FIXME
-					fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(req.debugID.get().first(), now(),
-					                                                         CommitDebugTrace::MASTERPROXYSERVER_BATCHER)));
 				}
 
 				if(!batch.size()) {
@@ -747,12 +739,8 @@ ACTOR Future<Void> commitBatch(
 		TraceEvent("SecondCommitBatch", self->dbgid).detail("DebugID", debugID.get());
 	}
 
-	if (debugID.present()) {
+	if (debugID.present())
 		g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.Before");
-		//FIXME
-		fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(debugID.get().first(), now(),
-		                                                         CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_BEFORE)));
-	}
 
 	/////// Phase 1: Pre-resolution processing (CPU bound except waiting for a version # which is separately pipelined and *should* be available by now (unless empty commit); ordered; currently atomic but could yield)
@@ -761,12 +749,8 @@ ACTOR Future<Void> commitBatch(
 	wait(self->latestLocalCommitBatchResolving.whenAtLeast(localBatchNumber-1));
 	state Future<Void> releaseDelay = delay(std::min(SERVER_KNOBS->MAX_PROXY_COMPUTE, batchOperations*self->commitComputePerOperation[latencyBucket]), TaskPriority::ProxyMasterVersionReply);
 
-	if (debugID.present()) {
+	if (debugID.present())
 		g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.GettingCommitVersion");
-		//FIXME
-		fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(debugID.get().first(), now(),
-		                                                         CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_GETTINGCOMMITVERSION)));
-	}
 
 	GetCommitVersionRequest req(self->commitVersionRequestNumber++, self->mostRecentProcessedRequestNumber, self->dbgid);
 	GetCommitVersionReply versionReply = wait( brokenPromiseToNever(self->master.getCommitVersion.getReply(req, TaskPriority::ProxyMasterVersionReply)) );
@@ -786,12 +770,8 @@ ACTOR Future<Void> commitBatch(
 
 	//TraceEvent("ProxyGotVer", self->dbgid).detail("Commit", commitVersion).detail("Prev", prevVersion);
 
-	if (debugID.present()) {
+	if (debugID.present())
 		g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.GotCommitVersion");
-		//FIXME
-		fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(debugID.get().first(), now(),
-		                                                         CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_GOTCOMMITVERSION)));
-	}
 
 	ResolutionRequestBuilder requests( self, commitVersion, prevVersion, self->version );
 	int conflictRangeCount = 0;
@@ -823,12 +803,8 @@ ACTOR Future<Void> commitBatch(
 	/////// Phase 2: Resolution (waiting on the network; pipelined)
 	state vector<ResolveTransactionBatchReply> resolution = wait( getAll(replies) );
 
-	if (debugID.present()) {
+	if (debugID.present())
 		g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.AfterResolution");
-		//FIXME
-		fbTrace(Reference<CommitDebugTrace>(new CommitDebugTrace(debugID.get().first(), now(),
-		                                                         CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_AFTERRESOLUTION)));
-	}
////// Phase 3: Post-resolution processing (CPU bound except for very rare situations; ordered; currently atomic but doesn't need to be) TEST(self->latestLocalCommitBatchLogging.get() < localBatchNumber - 1); // Queuing post-resolution commit processing @@ -839,12 +815,8 @@ ACTOR Future commitBatch( state double computeDuration = 0; self->stats.txnCommitResolved += trs.size(); - if (debugID.present()) { + if (debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.ProcessingMutations"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), - CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_PROCESSINGMUTATIONS))); - } state Arena arena; state bool isMyFirstBatch = !self->version; @@ -1141,12 +1113,8 @@ ACTOR Future commitBatch( state LogSystemDiskQueueAdapter::CommitMessage msg = storeCommits.back().first.get(); - if (debugID.present()) { + if (debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.AfterStoreCommits"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), - CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_AFTERSTORECOMMITS))); - } // txnState (transaction subsystem state) tag: message extracted from log adapter bool firstMessage = true; @@ -1220,12 +1188,8 @@ ACTOR Future commitBatch( debug_advanceMinCommittedVersion(UID(), commitVersion); //TraceEvent("ProxyPushed", self->dbgid).detail("PrevVersion", prevVersion).detail("Version", commitVersion); - if (debugID.present()) { + if (debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "MasterProxyServer.commitBatch.AfterLogPush"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(debugID.get().first(), now(), - CommitDebugTrace::MASTERPROXYSERVER_COMMITBATCH_AFTERLOGPUSH))); - } for (auto &p : storeCommits) { ASSERT(!p.second.isReady()); @@ -1352,9 +1316,6 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi if (debugID.present()) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.getLiveCommittedVersion.confirmEpochLive"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), - TransactionDebugTrace::MASTERPROXYSERVER_GETLIVECOMMITTEDVERSION_CONFIRMEPOCHLIVE))); } vector versions = wait(getAll(proxyVersions)); @@ -1371,9 +1332,6 @@ ACTOR Future getLiveCommittedVersion(ProxyCommitData* commi if (debugID.present()) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.getLiveCommittedVersion.After"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), - TransactionDebugTrace::MASTERPROXYSERVER_GETLIVECOMMITTEDVERSION_AFTER))); } commitData->stats.txnStartOut += transactionCount; @@ -1535,9 +1493,6 @@ ACTOR static Future transactionStarter( if (debugID.present()) { g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "MasterProxyServer.masterProxyServerCore.Broadcast"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(debugID.get().first(), now(), - TransactionDebugTrace::MASTERPROXYSERVER_MASTERPROXYSERVERCORE_BROADCAST))); } for (int i = 0; i < start.size(); i++) { @@ -1984,12 +1939,8 @@ ACTOR Future masterProxyServerCore( } when(GetRawCommittedVersionRequest req = waitNext(proxy.getRawCommittedVersion.getFuture())) { //TraceEvent("ProxyGetRCV", proxy.id()); - if (req.debugID.present()) { + if (req.debugID.present()) g_traceBatch.addEvent("TransactionDebug", 
req.debugID.get().first(), "MasterProxyServer.masterProxyServerCore.GetRawCommittedVersion"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), - TransactionDebugTrace::MASTERPROXYSERVER_MASTERPROXYSERVERCORE_GETRAWCOMMITTEDVERSION))); - } GetReadVersionReply rep; rep.locked = commitData.locked; rep.metadataVersion = commitData.metadataVersion; diff --git a/fdbserver/Resolver.actor.cpp b/fdbserver/Resolver.actor.cpp index 34bd28f5e3..64b2959c63 100644 --- a/fdbserver/Resolver.actor.cpp +++ b/fdbserver/Resolver.actor.cpp @@ -30,7 +30,6 @@ #include "fdbserver/ConflictSet.h" #include "fdbserver/StorageMetrics.h" #include "fdbclient/SystemData.h" -#include "flow/FBTrace.h" #include "flow/actorcompiler.h" // This must be the last #include. namespace { @@ -116,9 +115,6 @@ ACTOR Future resolveBatch( debugID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("CommitAttachID", req.debugID.get().first(), debugID.get().first()); g_traceBatch.addEvent("CommitDebug",debugID.get().first(),"Resolver.resolveBatch.Before"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::RESOLVER_RESOLVEBATCH_BEFORE))); } /*TraceEvent("ResolveBatchStart", self->dbgid).detail("From", proxyAddress).detail("Version", req.version).detail("PrevVersion", req.prevVersion).detail("StateTransactions", req.txnStateTransactions.size()) @@ -136,9 +132,6 @@ ACTOR Future resolveBatch( if(debugID.present()) { g_traceBatch.addEvent("CommitDebug",debugID.get().first(),"Resolver.resolveBatch.AfterQueueSizeCheck"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::RESOLVER_RESOLVEBATCH_AFTERQUEUESIZECHECK))); } loop { @@ -171,12 +164,8 @@ ACTOR Future resolveBatch( Version firstUnseenVersion = proxyInfo.lastVersion + 1; proxyInfo.lastVersion = req.version; - if(req.debugID.present()) { + if(req.debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.AfterOrderer"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::RESOLVER_RESOLVEBATCH_AFTERORDERER))); - } ResolveTransactionBatchReply& reply = proxyInfo.outstandingBatches[req.version]; @@ -287,12 +276,8 @@ ACTOR Future resolveBatch( self->checkNeededVersion.trigger(); } - if(req.debugID.present()) { + if(req.debugID.present()) g_traceBatch.addEvent("CommitDebug", debugID.get().first(), "Resolver.resolveBatch.After"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::RESOLVER_RESOLVEBATCH_AFTER))); - } } else { TEST(true); // Duplicate resolve batch request diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp index 37c118f548..5e2471cb9a 100644 --- a/fdbserver/StorageCache.actor.cpp +++ b/fdbserver/StorageCache.actor.cpp @@ -1 +1,1011 @@ +/* + * StorageCache.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/Knobs.h" +#include "fdbserver/ServerDBInfo.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/VersionedMap.h" +#include "fdbclient/KeyRangeMap.h" +#include "fdbclient/Atomic.h" +#include "fdbclient/Notified.h" +#include "fdbserver/LogSystem.h" +#include "fdbserver/WaitFailure.h" +#include "fdbserver/WorkerInterface.actor.h" +#include "flow/FBTrace.h" +#include "flow/actorcompiler.h" // This must be the last #include. + + +//TODO storageCache server shares quite a bit of storageServer functionality, although simplified +// Need to look into refactoring common code out for better code readability and to avoid duplication + +//TODO rename wrong_shard_server error to wrong_cache_server +inline bool canReplyWith(Error e) { + switch(e.code()) { + case error_code_transaction_too_old: + case error_code_future_version: + case error_code_wrong_shard_server: + case error_code_process_behind: + //case error_code_all_alternatives_failed: + return true; + default: + return false; + }; +} + +const int VERSION_OVERHEAD = 64 + sizeof(Version) + sizeof(Standalone) + //mutationLog, 64b overhead for map + 2 * (64 + sizeof(Version) + sizeof(Reference::PTreeT>)); //versioned map [ x2 for createNewVersion(version+1) ], 64b overhead for map +static int mvccStorageBytes( MutationRef const& m ) { return VersionedMap::overheadPerItem * 2 + (MutationRef::OVERHEAD_BYTES + m.param1.size() + m.param2.size()) * 2; } + +struct StorageCacheData { + typedef VersionedMap VersionedData; +private: + // in-memory versioned struct (PTree as of now. Subject to change) + VersionedData versionedData; + // in-memory mutationLog that the versionedData contains references to + // TODO change it to a deque, already contains mutations in version order + std::map> mutationLog; // versions (durableVersion, version] + +public: + UID thisServerID; // unique id + uint16_t index; // server index + Reference>> logSystem; + Key ck; //cacheKey + KeyRangeMap cachedRangeMap; // map of cached key-ranges + + // The following are in rough order from newest to oldest + // TODO double check which ones we need for storageCache servers + Version lastTLogVersion, lastVersionWithData; + NotifiedVersion version; // current version i.e. the max version that can be read from the cache + NotifiedVersion desiredOldestVersion; // oldestVersion can be increased to this after compaction + NotifiedVersion oldestVersion; // Min version that might be read from the cache + + // TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use + Future compactionInProgress; + + // TODO do we need otherError here? 
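+ // (storageserver keeps an equivalent promise: actors park failures in it and
+ // the main loops rethrow them via otherError.getFuture().get() once
+ // isReady() -- see pullAsyncData below)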
+ Promise otherError; + + int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this cache server + bool behind; + + // TODO double check which ones we need for storageCache servers + struct Counters { + CounterCollection cc; + Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, finishedQueries, rowsQueried, bytesQueried, watchQueries; + Counter bytesInput, mutationBytes; // Like bytesInput but without MVCC accounting + Counter mutations, setMutations, clearRangeMutations, atomicMutations; + Counter updateBatches, updateVersions; + Counter loops; + Counter readsRejected; + + //LatencyBands readLatencyBands; + + Counters(StorageCacheData* self) + : cc("StorageCacheServer", self->thisServerID.toString()), + getKeyQueries("GetKeyQueries", cc), + getValueQueries("GetValueQueries",cc), + getRangeQueries("GetRangeQueries", cc), + allQueries("QueryQueue", cc), + finishedQueries("FinishedQueries", cc), + rowsQueried("RowsQueried", cc), + bytesQueried("BytesQueried", cc), + watchQueries("WatchQueries", cc), + bytesInput("BytesInput", cc), + mutationBytes("MutationBytes", cc), + mutations("Mutations", cc), + setMutations("SetMutations", cc), + clearRangeMutations("ClearRangeMutations", cc), + atomicMutations("AtomicMutations", cc), + updateBatches("UpdateBatches", cc), + updateVersions("UpdateVersions", cc), + loops("Loops", cc), + readsRejected("ReadsRejected", cc) + { + specialCounter(cc, "LastTLogVersion", [self](){ return self->lastTLogVersion; }); + specialCounter(cc, "Version", [self](){ return self->version.get(); }); + specialCounter(cc, "VersionLag", [self](){ return self->versionLag; }); + } + } counters; + + explicit StorageCacheData(UID thisServerID, uint16_t index) + : thisServerID(thisServerID), index(index), + logSystem(new AsyncVar>()), + lastTLogVersion(0), lastVersionWithData(0), + compactionInProgress(Void()), + versionLag(0), behind(false), counters(this) + { + version.initMetric(LiteralStringRef("StorageCacheData.Version"), counters.cc.id); + desiredOldestVersion.initMetric(LiteralStringRef("StorageCacheData.DesriedOldestVersion"), counters.cc.id); + oldestVersion.initMetric(LiteralStringRef("StorageCacheData.OldestVersion"), counters.cc.id); + } + + void addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation); + + bool isReadable( KeyRangeRef const& keys ) { + auto cr = cachedRangeMap.intersectingRanges(keys); + for(auto i = cr.begin(); i != cr.end(); ++i) + if (!i->value()) + return false; + return true; + } + + Arena lastArena; + std::map> const & getMutationLog() { return mutationLog; } + std::map>& getMutableMutationLog() { return mutationLog; } + VersionedData const& data() const { return versionedData; } + VersionedData& mutableData() { return versionedData; } + + Standalone& addVersionToMutationLog(Version v) { + // return existing version... 
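+ // (a version's mutations can arrive across several calls, so the log
+ // entry for v may already exist)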
+ auto m = mutationLog.find(v); + if (m != mutationLog.end()) + return m->second; + + // ...or create a new one + auto& u = mutationLog[v]; + u.version = v; + if (lastArena.getSize() >= 65536) lastArena = Arena(4096); + u.arena() = lastArena; + counters.bytesInput += VERSION_OVERHEAD; + return u; + } + + MutationRef addMutationToMutationLog(Standalone &mLV, MutationRef const& m){ + //TODO find out more + //byteSampleApplyMutation(m, mLV.version); + counters.bytesInput += mvccStorageBytes(m); + return mLV.mutations.push_back_deep( mLV.arena(), m ); + } + +}; + +///////////////////////////////////// Queries ///////////////////////////////// +#pragma region Queries +ACTOR Future waitForVersion( StorageCacheData* data, Version version ) { + // This could become an Actor transparently, but for now it just does the lookup + if (version == latestVersion) + version = std::max(Version(1), data->version.get()); + if (version < data->oldestVersion.get() || version <= 0) throw transaction_too_old(); + else if (version <= data->version.get()) + return version; + + if(data->behind && version > data->version.get()) { + throw process_behind(); + } + + if(deterministicRandom()->random01() < 0.001) + TraceEvent("WaitForVersion1000x"); + choose { + when ( wait( data->version.whenAtLeast(version) ) ) { + //FIXME: A bunch of these can block with or without the following delay 0. + //wait( delay(0) ); // don't do a whole bunch of these at once + if (version < data->oldestVersion.get()) throw transaction_too_old(); + return version; + } + when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { + if(deterministicRandom()->random01() < 0.001) + TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) + .detail("Version", version) + .detail("MyVersion", data->version.get()) + .detail("ServerID", data->thisServerID); + throw future_version(); + } + } +} + +ACTOR Future waitForVersionNoTooOld( StorageCacheData* data, Version version ) { + // This could become an Actor transparently, but for now it just does the lookup + if (version == latestVersion) + version = std::max(Version(1), data->version.get()); + if (version <= data->version.get()) + return version; + choose { + when ( wait( data->version.whenAtLeast(version) ) ) { + return version; + } + when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { + if(deterministicRandom()->random01() < 0.001) + TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) + .detail("Version", version) + .detail("MyVersion", data->version.get()) + .detail("ServerID", data->thisServerID); + throw future_version(); + } + } +} + +ACTOR Future getValueQ( StorageCacheData* data, GetValueRequest req ) { + state int64_t resultSize = 0; + try { + ++data->counters.getValueQueries; + ++data->counters.allQueries; + //++data->readQueueSizeMetric; + //TODO later + //data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue()); + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) // so we need to downgrade here + + //TODO what's this? 
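+ // delay(0, priority) answers the TODO above: it completes immediately but
+ // re-schedules this actor at DefaultEndpoint priority, so the read itself
+ // runs below the high-priority load-balancing path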
+ wait( delay(0, TaskPriority::DefaultEndpoint) ); + + if( req.debugID.present() ) { + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); + //FIXME + fbTrace(Reference(new GetValueDebugTrace(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ))); + } + + state Optional v; + state Version version = wait( waitForVersion( data, req.version ) ); + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask()); + + if (!data->cachedRangeMap[req.key]) { + //TraceEvent("WrongCacheServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); + throw wrong_shard_server(); + } + + state int path = 0; + auto i = data->data().at(version).lastLessOrEqual(req.key); + if (i && i->isValue() && i.key() == req.key) { + v = (Value)i->getValue(); + path = 1; + } + + //debugMutation("CacheGetValue", version, MutationRef(MutationRef::DebugKey, req.key, v.present()?v.get():LiteralStringRef(""))); + //debugMutation("CacheGetPath", version, MutationRef(MutationRef::DebugKey, req.key, path==0?LiteralStringRef("0"):path==1?LiteralStringRef("1"):LiteralStringRef("2"))); + + if (v.present()) { + ++data->counters.rowsQueried; + resultSize = v.get().size(); + data->counters.bytesQueried += resultSize; + } + + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask()); + + GetValueReply reply(v); + req.reply.send(reply); + } catch (Error& e) { + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + //--data->readQueueSizeMetric; + //if(data->latencyBandConfig.present()) { + // int maxReadBytes = data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits::max()); + // data->counters.readLatencyBands.addMeasurement(timer() - req.requestTime(), resultSize > maxReadBytes); + //} + + return Void(); +}; + +//TODO Implement the reverse readRange +GetKeyValuesReply readRange(StorageCacheData* data, Version version, KeyRangeRef range, int limit, int* pLimitBytes) { + GetKeyValuesReply result; + StorageCacheData::VersionedData::ViewAtVersion view = data->data().at(version); + StorageCacheData::VersionedData::iterator vCurrent = view.end(); + KeyRef readBegin; + KeyRef rangeBegin = range.begin; + KeyRef rangeEnd = range.end; + + //We might care about a clear beginning before start that runs into range + vCurrent = view.lastLessOrEqual(rangeBegin); + if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() > rangeBegin) + readBegin = vCurrent->getEndKey(); + else + readBegin = rangeBegin; + + vCurrent = view.lower_bound(readBegin); + ASSERT(!vCurrent || vCurrent.key() >= readBegin); + if (vCurrent) { + auto b = vCurrent; + --b; + ASSERT(!b || b.key() < readBegin); + } + int accumulatedBytes = 0; + while (vCurrent && vCurrent.key() < rangeEnd && limit > 0 && accumulatedBytes < *pLimitBytes) { + if (!vCurrent->isClearTo()) { + result.data.push_back_deep(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue())); + accumulatedBytes += sizeof(KeyValueRef) + result.data.end()[-1].expectedSize(); + --limit; + } + ++vCurrent; + } + + *pLimitBytes -= accumulatedBytes; + ASSERT(result.data.size() == 0 || *pLimitBytes + result.data.end()[-1].expectedSize() + sizeof(KeyValueRef) > 0); + 
result.more = limit == 0 || *pLimitBytes <= 0; // FIXME: Does this have to be exact? + result.version = version; + return result; +} + +Key findKey( StorageCacheData* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset) +// Attempts to find the key indicated by sel in the data at version, within range. +// Precondition: selectorInRange(sel, range) +// If it is found, offset is set to 0 and a key is returned which falls inside range. +// If the search would depend on any key outside range OR if the key selector offset is too large (range read returns too many bytes), it returns either +// a negative offset and a key in [range.begin, sel.getKey()], indicating the key is (the first key <= returned key) + offset, or +// a positive offset and a key in (sel.getKey(), range.end], indicating the key is (the first key >= returned key) + offset-1 +// The range passed in to this function should specify a shard. If range.begin is repeatedly not the beginning of a shard, then it is possible to get stuck looping here +{ + ASSERT( version != latestVersion ); + ASSERT( selectorInRange(sel, range) && version >= data->oldestVersion.get()); + + // Count forward or backward distance items, skipping the first one if it == key and skipEqualKey + bool forward = sel.offset > 0; // If forward, result >= sel.getKey(); else result <= sel.getKey() + int sign = forward ? +1 : -1; + bool skipEqualKey = sel.orEqual == forward; + int distance = forward ? sel.offset : 1-sel.offset; + + //Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from the read range in this case) + int maxBytes; + if (sel.offset <= 1 && sel.offset >= 0) + maxBytes = std::numeric_limits::max(); + else + maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES; + + GetKeyValuesReply rep = readRange( data, version, + forward ? KeyRangeRef(sel.getKey(), range.end) : KeyRangeRef(range.begin, keyAfter(sel.getKey())), + (distance + skipEqualKey)*sign, &maxBytes ); + bool more = rep.more && rep.data.size() != distance + skipEqualKey; + + //If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in a loop + if(more && !forward && rep.data.size() == 1) { + TEST(true); //Reverse key selector returned only one result in range read + maxBytes = std::numeric_limits::max(); + GetKeyValuesReply rep2 = readRange( data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes ); + rep = rep2; + more = rep.more && rep.data.size() != distance + skipEqualKey; + ASSERT(rep.data.size() == 2 || !more); + } + + int index = distance-1; + if (skipEqualKey && rep.data.size() && rep.data[0].key == sel.getKey() ) + ++index; + + if (index < rep.data.size()) { + *pOffset = 0; + return rep.data[ index ].key; + } else { + // FIXME: If range.begin=="" && !forward, return success? + *pOffset = index - rep.data.size() + 1; + if (!forward) *pOffset = -*pOffset; + + if (more) { + TEST(true); // Key selector read range had more results + + ASSERT(rep.data.size()); + Key returnKey = forward ? keyAfter(rep.data.back().key) : rep.data.back().key; + + //This is possible if key/value pairs are very large and only one result is returned on a last less than query + //SOMEDAY: graceful handling of exceptionally sized values + ASSERT(returnKey != sel.getKey()); + + return returnKey; + } else + return forward ? 
range.end : range.begin; + } +} + +KeyRange getCachedKeyRange( StorageCacheData* data, const KeySelectorRef& sel ) +// Returns largest range that is cached on this server and selectorInRange(sel, range) or wrong_shard_server if no such range exists +{ + auto i = sel.isBackward() ? data->cachedRangeMap.rangeContainingKeyBefore( sel.getKey() ) : + data->cachedRangeMap.rangeContaining( sel.getKey() ); + if (!i->value()) throw wrong_shard_server(); + ASSERT( selectorInRange(sel, i->range()) ); + return i->range(); +} + +ACTOR Future getKeyValues( StorageCacheData* data, GetKeyValuesRequest req ) +// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large selector offset prevents +// all data from being read in one range read +{ + state int64_t resultSize = 0; + + ++data->counters.getRangeQueries; + ++data->counters.allQueries; + //++data->readQueueSizeMetric; + //data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue()); + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) + // so we need to downgrade here + TaskPriority taskType = TaskPriority::DefaultEndpoint; + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { + taskType = TaskPriority::FetchKeys; + // } else if (false) { + // // Placeholder for up-prioritizing fetches for important requests + // taskType = TaskPriority::DefaultDelay; + } + wait( delay(0, taskType) ); + + try { + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Before"); + state Version version = wait( waitForVersion( data, req.version ) ); + + state KeyRange cachedKeyRange; + try { + cachedKeyRange = getCachedKeyRange(data, req.begin); + + if (req.debugID.present()) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), + "storagecache.getKeyValues.AfterVersion"); + //.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end); + } catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", "None").detail("In", "getKeyValues>getShardKeyRange"); throw e; } + + if ( !selectorInRange(req.end, cachedKeyRange) && !(req.end.isFirstGreaterOrEqual() && req.end.getKey() == cachedKeyRange.end) ) { +// TraceEvent("WrongShardServer1", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkShardExtents"); + throw wrong_shard_server(); + } + + state int offset1; + state int offset2; + state Key begin = req.begin.isFirstGreaterOrEqual() ? req.begin.getKey() : findKey( data, req.begin, version, cachedKeyRange, &offset1 ); + state Key end = req.end.isFirstGreaterOrEqual() ? 
req.end.getKey() : findKey( data, req.end, version, cachedKeyRange, &offset2 ); + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterKeys"); + //.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey()); + + // Offsets of zero indicate begin/end keys in this cachedKeyRange, which obviously means we can answer the query + // An end offset of 1 is also OK because the end key is exclusive, so if the first key of the next cachedKeyRange is the end the last actual key returned must be from this cachedKeyRange. + // A begin offset of 1 is also OK because then either begin is past end or equal to end (so the result is definitely empty) + if ((offset1 && offset1!=1) || (offset2 && offset2!=1)) { + TEST(true); // wrong_cache_server due to offset + // We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end, and return a clipped range rather + // than an error (since that is what the NativeAPI.getRange will do anyway via its "slow path"), but we would have to add some flags to the response + // to encode whether we went off the beginning and the end, since it needs that information. + //TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2); + throw wrong_shard_server(); + } + + if (begin >= end) { + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Send"); + //.detail("Begin",begin).detail("End",end); + + GetKeyValuesReply none; + none.version = version; + none.more = false; + req.reply.send( none ); + } else { + state int remainingLimitBytes = req.limitBytes; + + GetKeyValuesReply _r = readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes); + GetKeyValuesReply r = _r; + + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterReadRange"); + //.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size()); + if (EXPENSIVE_VALIDATION) { + for (int i = 0; i < r.data.size(); i++) + ASSERT(r.data[i].key >= begin && r.data[i].key < end); + ASSERT(r.data.size() <= std::abs(req.limit)); + } + + req.reply.send( r ); + + resultSize = req.limitBytes - remainingLimitBytes; + data->counters.bytesQueried += resultSize; + data->counters.rowsQueried += r.data.size(); + } + } catch (Error& e) { + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + + return Void(); +} + +ACTOR Future getKey( StorageCacheData* data, GetKeyRequest req ) { + state int64_t resultSize = 0; + + ++data->counters.getKeyQueries; + ++data->counters.allQueries; + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) + // so we need to downgrade here + wait( delay(0, TaskPriority::DefaultEndpoint) ); + + try { + state Version version = wait( waitForVersion( data, req.version ) ); + state KeyRange cachedKeyRange = getCachedKeyRange( data, req.sel ); + + state int offset; + Key k = findKey( data, req.sel, version, cachedKeyRange, &offset ); + + KeySelector updated; + if (offset < 0) + 
updated = firstGreaterOrEqual(k)+offset; // first thing on this shard OR (large offset case) smallest key retrieved in range read + else if (offset > 0) + updated = firstGreaterOrEqual(k)+offset-1; // first thing on next shard OR (large offset case) keyAfter largest key retrieved in range read + else + updated = KeySelectorRef(k,true,0); //found + + resultSize = k.size(); + data->counters.bytesQueried += resultSize; + ++data->counters.rowsQueried; + + GetKeyReply reply(updated); + req.reply.send(reply); + } + catch (Error& e) { + if (e.code() == error_code_wrong_shard_server) TraceEvent("WrongShardServer").detail("In","getKey"); + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + + return Void(); +} + +#pragma endregion + +bool expandMutation( MutationRef& m, StorageCacheData::VersionedData const& data, KeyRef eagerTrustedEnd, Arena& ar ) { + // After this function call, m should be copied into an arena immediately (before modifying data, shards, or eager) + if (m.type == MutationRef::ClearRange) { + // Expand the clear + const auto& d = data.atLatest(); + + // If another clear overlaps the beginning of this one, engulf it + auto i = d.lastLess(m.param1); + if (i && i->isClearTo() && i->getEndKey() >= m.param1) + m.param1 = i.key(); + + // If another clear overlaps the end of this one, engulf it; otherwise expand + i = d.lastLessOrEqual(m.param2); + if (i && i->isClearTo() && i->getEndKey() >= m.param2) { + m.param2 = i->getEndKey(); + } else { + // Expand to the next set or clear (from storage or latestVersion), and if it + // is a clear, engulf it as well + i = d.lower_bound(m.param2); + //KeyRef endKeyAtStorageVersion = m.param2 == eagerTrustedEnd ? eagerTrustedEnd : std::min( eager->getKeyEnd( m.param2 ), eagerTrustedEnd ); + // TODO check if the following is correct + KeyRef endKeyAtStorageVersion = eagerTrustedEnd; + if (!i || endKeyAtStorageVersion < i.key()) + m.param2 = endKeyAtStorageVersion; + else if (i->isClearTo()) + m.param2 = i->getEndKey(); + else + m.param2 = i.key(); + } + } + else if (m.type != MutationRef::SetValue && (m.type)) { + + Optional oldVal; + auto it = data.atLatest().lastLessOrEqual(m.param1); + if (it != data.atLatest().end() && it->isValue() && it.key() == m.param1) + oldVal = it->getValue(); + else if (it != data.atLatest().end() && it->isClearTo() && it->getEndKey() > m.param1) { + TEST(true); // Atomic op right after a clear. 
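+ // oldVal stays empty in this branch: the clear covers m.param1, so the
+ // atomic op below is applied as if the key were absent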
+ } + + switch(m.type) { + case MutationRef::AddValue: + m.param2 = doLittleEndianAdd(oldVal, m.param2, ar); + break; + case MutationRef::And: + m.param2 = doAnd(oldVal, m.param2, ar); + break; + case MutationRef::Or: + m.param2 = doOr(oldVal, m.param2, ar); + break; + case MutationRef::Xor: + m.param2 = doXor(oldVal, m.param2, ar); + break; + case MutationRef::AppendIfFits: + m.param2 = doAppendIfFits(oldVal, m.param2, ar); + break; + case MutationRef::Max: + m.param2 = doMax(oldVal, m.param2, ar); + break; + case MutationRef::Min: + m.param2 = doMin(oldVal, m.param2, ar); + break; + case MutationRef::ByteMin: + m.param2 = doByteMin(oldVal, m.param2, ar); + break; + case MutationRef::ByteMax: + m.param2 = doByteMax(oldVal, m.param2, ar); + break; + case MutationRef::MinV2: + m.param2 = doMinV2(oldVal, m.param2, ar); + break; + case MutationRef::AndV2: + m.param2 = doAndV2(oldVal, m.param2, ar); + break; + case MutationRef::CompareAndClear: + if (oldVal.present() && m.param2 == oldVal.get()) { + m.type = MutationRef::ClearRange; + m.param2 = keyAfter(m.param1, ar); + return expandMutation(m, data, eagerTrustedEnd, ar); + } + return false; + } + m.type = MutationRef::SetValue; + } + + return true; +} + +// Applies a write mutation (SetValue or ClearRange) to the in-memory versioned data structure +void applyMutation( StorageCacheData *self, MutationRef const& m, Arena& arena, StorageCacheData::VersionedData &data ) { + // m is expected to be in arena already + // Clear split keys are added to arena + + if (m.type == MutationRef::SetValue) { + auto prev = data.atLatest().lastLessOrEqual(m.param1); + if (prev && prev->isClearTo() && prev->getEndKey() > m.param1) { + ASSERT( prev.key() <= m.param1 ); + KeyRef end = prev->getEndKey(); + // TODO double check if the insert version of the previous clear needs to be preserved for the "left half", + // insert() invalidates prev, so prev.key() is not safe to pass to it by reference + data.insert( KeyRef(prev.key()), ValueOrClearToRef::clearTo( m.param1 ), prev.insertVersion() ); // overwritten by below insert if empty + KeyRef nextKey = keyAfter(m.param1, arena); + if ( end != nextKey ) { + ASSERT( end > nextKey ); + // TODO double check if it's okay to let go of the the insert version of the "right half" + // FIXME: This copy is technically an asymptotic problem, definitely a waste of memory (copy of keyAfter is a waste, but not asymptotic) + data.insert( nextKey, ValueOrClearToRef::clearTo( KeyRef(arena, end) ) ); + } + } + data.insert( m.param1, ValueOrClearToRef::value(m.param2) ); + } else if (m.type == MutationRef::ClearRange) { + data.erase( m.param1, m.param2 ); + ASSERT( m.param2 > m.param1 ); + ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) ); + data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) ); + } +} + +template +void splitMutation(StorageCacheData* data, KeyRangeMap& map, MutationRef const& m, Version ver) { + if(isSingleKeyMutation((MutationRef::Type) m.type)) { + auto i = map.rangeContaining(m.param1); + if (i->value()) // If this key lies in the cached key-range on this server + data->addMutation( i->range(), ver, m ); + } + else if (m.type == MutationRef::ClearRange) { + KeyRangeRef mKeys( m.param1, m.param2 ); + auto r = map.intersectingRanges( mKeys ); + for(auto i = r.begin(); i != r.end(); ++i) { + if (i->value()) { // if this sub-range exists on this cache server + KeyRangeRef k = mKeys & i->range(); + data->addMutation( i->range(), ver, MutationRef((MutationRef::Type)m.type, k.begin, k.end) ); + } + } 
} else
+ ASSERT(false); // Unknown mutation type in splitMutations
+}
+
+void StorageCacheData::addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation) {
+ MutationRef expanded = mutation;
+ auto& mLog = addVersionToMutationLog(version);
+
+ if ( !expandMutation( expanded, data(), cachedKeyRange.end, mLog.arena()) ) {
+ return;
+ }
+ expanded = addMutationToMutationLog(mLog, expanded);
+ if (debugMutation("expandedMutation", version, expanded)) {
+ const char* type =
+ mutation.type == MutationRef::SetValue ? "SetValue" :
+ mutation.type == MutationRef::ClearRange ? "ClearRange" :
+ mutation.type == MutationRef::DebugKeyRange ? "DebugKeyRange" :
+ mutation.type == MutationRef::DebugKey ? "DebugKey" :
+ "UnknownMutation";
+ printf("DEBUGMUTATION:\t%.6f\t%s\t%s\t%s\t%s\t%s\n",
+ now(), g_network->getLocalAddress().toString().c_str(), "originalMutation",
+ type, printable(mutation.param1).c_str(), printable(mutation.param2).c_str());
+ printf(" Cached Key-range: %s - %s\n", printable(cachedKeyRange.begin).c_str(), printable(cachedKeyRange.end).c_str());
+ }
+ applyMutation( this, expanded, mLog.arena(), mutableData() );
+ printf("\nSCUpdate: Printing versioned tree after applying mutation\n");
+ mutableData().printTree(version);
+
+}
+
+// Helper class for updating the storage cache (i.e. applying mutations)
+class StorageCacheUpdater {
+public:
+ StorageCacheUpdater() : currentVersion(invalidVersion), processedCacheStartKey(false) {}
+ StorageCacheUpdater(Version currentVersion) : currentVersion(currentVersion), processedCacheStartKey(false) {}
+
+ void applyMutation(StorageCacheData* data, MutationRef const& m, Version ver) {
+ //TraceEvent("SCNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver);
+
+ if(currentVersion != ver) {
+ currentVersion = ver;
+ data->mutableData().createNewVersion(ver);
+ }
+
+ if (m.param1.startsWith( systemKeys.end )) {
+ //TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver);
+ applyPrivateCacheData( data, m );
+ } else {
+ // FIXME: enable when debugMutation is active
+ //for(auto m = changes[c].mutations.begin(); m; ++m) {
+ // debugMutation("SCUpdateMutation", changes[c].version, *m);
+ //}
+
+ splitMutation(data, data->cachedRangeMap, m, ver);
+ }
+
+ //TODO
+ if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
+ }
+
+ Version currentVersion;
+private:
+ KeyRef cacheStartKey;
+ bool nowAssigned;
+ bool processedCacheStartKey;
+
+ // Applies private mutations, as the name suggests. It basically establishes the key-ranges
+ // that this cache server is responsible for
+ // TODO Revisit during failure handling. Might we lose some private mutations?
+ void applyPrivateCacheData( StorageCacheData* data, MutationRef const& m ) {
+ TraceEvent(SevDebug, "SCPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());
+
+ if (processedCacheStartKey) {
+ // we expect changes in pairs, [begin,end).
// This mutation is for the end key of the range
+ ASSERT (m.type == MutationRef::SetValue && m.param1.startsWith(data->ck));
+ KeyRangeRef keys( cacheStartKey.removePrefix(data->ck), m.param1.removePrefix(data->ck));
+ data->cachedRangeMap.insert(keys, true);
+ fprintf(stderr, "SCPrivateCacheMutation: begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str());
+
+ processedCacheStartKey = false;
+ } else if (m.type == MutationRef::SetValue && m.param1.startsWith( data->ck )) {
+ // We expect changes in pairs, [begin,end). This mutation is for the start key of the range
+ cacheStartKey = m.param1;
+ processedCacheStartKey = true;
+ } else {
+ fprintf(stderr, "SCPrivateCacheMutation: Unknown private mutation\n");
+ ASSERT(false); // Unknown private mutation
+ }
+ }
+};
+
+// Compacts the in-memory VersionedMap, i.e. removes versions below the desiredOldestVersion
+// TODO revisit if we change the data structure
+ACTOR Future<Void> compactCache(StorageCacheData* data) {
+ loop {
+ //TODO understand this, should we add delay here?
+ //if (g_network->isSimulated()) {
+ // double endTime = g_simulator.checkDisabled(format("%s/compactCache", data->thisServerID.toString().c_str()));
+ // if(endTime > now()) {
+ // wait(delay(endTime - now(), TaskPriority::CompactCache));
+ // }
+ //}
+
+ // Wait until the desiredOldestVersion is greater than the current oldestVersion
+ wait( data->desiredOldestVersion.whenAtLeast( data->oldestVersion.get()+1 ) );
+ wait( delay(0, TaskPriority::CompactCache) );
+
+ //TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use
+ state Promise<Void> compactionInProgress;
+ data->compactionInProgress = compactionInProgress.getFuture();
+ state Version desiredVersion = data->desiredOldestVersion.get();
+ // Call the compaction routine that does the actual work,
+ // TODO It's a synchronous function call as of now. Should it be async?
+ data->mutableData().compact(desiredVersion);
+ Future<Void> finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( desiredVersion,
+ TaskPriority::CompactCache );
+ data->oldestVersion.set( desiredVersion );
+ wait( finishedForgetting );
+ // TODO how do we yield here? This may not be enough, because compact() does the heavy lifting
+ // of compacting the VersionedMap. We should probably look into per version compaction and then
+ // we can yield after compacting one version
+ wait( yield(TaskPriority::CompactCache) );
+
+ // TODO what flowlock to acquire during compaction?
+ compactionInProgress.send(Void());
+ wait( delay(0, TaskPriority::CompactCache) ); //Setting compactionInProgress could cause the cache server to shut down, so delay to check for cancellation
+ }
+}
+
+ACTOR Future<Void> pullAsyncData( StorageCacheData *data ) {
+ state Future<Void> dbInfoChange = Void();
+ state Reference<ILogSystem::IPeekCursor> r;
+ state Version tagAt = 0;
+
+ state StorageCacheUpdater updater(data->lastVersionWithData);
+ state Version ver = invalidVersion;
+ //data->lastTLogVersion = r->getMaxKnownVersion();
+ //data->versionLag = std::max(0, data->lastTLogVersion - data->version.get());
+ ++data->counters.updateBatches;
+
+ loop {
+ loop {
+ choose {
+ when(wait( r ? r->getMore(TaskPriority::TLogCommit) : Never() ) ) {
+ break;
+ }
+ when( wait( dbInfoChange ) ) {
+ if( data->logSystem->get() )
+ r = data->logSystem->get()->peek( data->thisServerID, tagAt, Optional<Version>(), cacheTag, true );
+ else
+ r = Reference<ILogSystem::IPeekCursor>();
+ dbInfoChange = data->logSystem->onChange();
+ }
+ }
+ }
+ //FIXME: if the popped version is greater than our last version, we need to clear the cache
+
+ //FIXME: ensure this can only read data from the current version
+ r->setProtocolVersion(currentProtocolVersion);
+
+ // Now process the mutations
+ for (; r->hasMessage(); r->nextMessage()) {
+ ArenaReader& reader = *r->reader();
+
+ MutationRef msg;
+ reader >> msg;
+ fprintf(stderr, "%lld : %s\n", r->version().version, msg.toString().c_str());
+
+ if (r->version().version > ver && r->version().version > data->version.get()) {
+ ++data->counters.updateVersions;
+ ver = r->version().version;
+ }
+ if (ver != invalidVersion) // This change belongs to a version < minVersion
+ {
+ updater.applyMutation(data, msg, ver);
+ // TODO
+ //mutationBytes += msg.totalSize();
+ data->counters.mutationBytes += msg.totalSize();
+ ++data->counters.mutations;
+ switch(msg.type) {
+ case MutationRef::SetValue:
+ ++data->counters.setMutations;
+ break;
+ case MutationRef::ClearRange:
+ ++data->counters.clearRangeMutations;
+ break;
+ case MutationRef::AddValue:
+ case MutationRef::And:
+ case MutationRef::AndV2:
+ case MutationRef::AppendIfFits:
+ case MutationRef::ByteMax:
+ case MutationRef::ByteMin:
+ case MutationRef::Max:
+ case MutationRef::Min:
+ case MutationRef::MinV2:
+ case MutationRef::Or:
+ case MutationRef::Xor:
+ case MutationRef::CompareAndClear:
+ ++data->counters.atomicMutations;
+ break;
+ }
+ }
+ else
+ TraceEvent(SevError, "DiscardingPeekedData", data->thisServerID).detail("Mutation", msg.toString()).detail("Version", r->version().toString());
+
+ tagAt = r->version().version + 1;
+ }
+
+ if(ver != invalidVersion) {
+ data->lastVersionWithData = ver;
+ } else {
+ // TODO double check
+ ver = r->version().version - 1;
+ }
+
+ if(ver != invalidVersion && ver > data->version.get()) {
+ debugKeyRange("SCUpdate", ver, allKeys);
+
+ data->mutableData().createNewVersion(ver);
+
+ // TODO what about otherError
+ if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
+
+ // TODO may enable these later
+ //data->noRecentUpdates.set(false);
+ //data->lastUpdate = now();
+ data->version.set( ver ); // Triggers replies to waiting gets for new version(s)
+ // TODO double check
+ //setDataVersion(data->thisServerID, data->version.get());
+
+ // TODO what about otherError
+ if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
+
+ // we can get rid of versions beyond maxVersionsInMemory at any point. Update the
+ //desiredOldestVersion and that may invoke the compaction actor
+ Version maxVersionsInMemory = SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS;
+ Version proposedOldestVersion = data->version.get() - maxVersionsInMemory;
+ proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get());
+ data->desiredOldestVersion.set(proposedOldestVersion);
+ }
+
+ // TODO implement a validate function for the cache
+ //validate(data);
+
+ if(r->version().version >= data->lastTLogVersion) {
+ if(data->behind) {
+ TraceEvent("StorageCacheNoLongerBehind", data->thisServerID).detail("CursorVersion", r->version().version).detail("TLogVersion", data->lastTLogVersion);
+ }
+ data->behind = false;
+ }
+
+ tagAt = std::max( tagAt, r->version().version);
+ }
+}
+
+ACTOR Future<Void> storageCache(StorageServerInterface ssi, uint16_t id, Reference<AsyncVar<ServerDBInfo>> db) {
+ state StorageCacheData self(ssi.id(), id);
+ state ActorCollection actors(false);
+ state Future<Void> dbInfoChange = Void();
+
+ // This helps identify the private mutations meant for this cache server
+ self.ck = cacheKeysPrefixFor( id ).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/
+
+ actors.add(waitFailureServer(ssi.waitFailure.getFuture()));
+
+ // compactCache actor will periodically compact the cache when a certain version condition is met
+ actors.add(compactCache(&self));
+
+ // pullAsyncData actor pulls mutations from the TLog and also applies them.
+ actors.add(pullAsyncData(&self));
+
+ loop {
+ ++self.counters.loops;
+ choose {
+ when( wait( dbInfoChange ) ) {
+ dbInfoChange = db->onChange();
+ self.logSystem->set(ILogSystem::fromServerDBInfo( ssi.id(), db->get(), true ));
+ }
+ when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) {
+ // TODO do we need to add throttling for cache servers?
Probably not + //actors.add(self->readGuard(req , getValueQ)); + actors.add(getValueQ(&self, req)); + } + when( WatchValueRequest req = waitNext(ssi.watchValue.getFuture()) ) { + ASSERT(false); + } + when (GetKeyRequest req = waitNext(ssi.getKey.getFuture())) { + actors.add(getKey(&self, req)); + } + when (GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture()) ) { + actors.add(getKeyValues(&self, req)); + } + when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) { + ASSERT(false); + } + when (StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) { + ASSERT(false); + } + //when( ReplyPromise reply = waitNext(ssi.getVersion.getFuture()) ) { + // ASSERT(false); + //} + when( ReplyPromise reply = waitNext(ssi.getKeyValueStoreType.getFuture()) ) { + ASSERT(false); + } + when(wait(actors.getResult())) {} + } + } +} diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index e1a26a3042..7ce0bb5d79 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -21,7 +21,6 @@ #include "flow/Hash3.h" #include "flow/Stats.h" #include "flow/UnitTest.h" -#include "flow/FBTrace.h" #include "fdbclient/NativeAPI.actor.h" #include "fdbclient/Notified.h" #include "fdbclient/KeyRangeMap.h" @@ -1767,9 +1766,6 @@ ACTOR Future tLogCommit( tlogDebugID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("CommitAttachID", req.debugID.get().first(), tlogDebugID.get().first()); g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.BeforeWaitForVersion"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::TLOG_TLOGCOMMIT_BEFOREWAITFORVERSION))); } logData->minKnownCommittedVersion = std::max(logData->minKnownCommittedVersion, req.minKnownCommittedVersion); @@ -1799,12 +1795,8 @@ ACTOR Future tLogCommit( } if (logData->version.get() == req.prevVersion) { // Not a duplicate (check relies on critical section between here self->version.set() below!) - if(req.debugID.present()) { + if(req.debugID.present()) g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.Before"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::TLOG_TLOGCOMMIT_BEFORE))); - } //TraceEvent("TLogCommit", logData->logId).detail("Version", req.version); commitMessages(self, logData, req.version, req.arena, req.messages); @@ -1827,12 +1819,8 @@ ACTOR Future tLogCommit( // Notifies the commitQueue actor to commit persistentQueue, and also unblocks tLogPeekMessages actors logData->version.set( req.version ); - if(req.debugID.present()) { + if(req.debugID.present()) g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.AfterTLogCommit"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::TLOG_TLOGCOMMIT_AFTERTLOGCOMMIT))); - } } // Send replies only once all prior messages have been received and committed. 
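 // stopCommit fires if this TLog generation is stopped while the commit below
 // is in flight, letting the actor bail out early instead of waiting forever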
state Future stopped = logData->stopCommit.onTrigger(); @@ -1844,12 +1832,8 @@ ACTOR Future tLogCommit( return Void(); } - if(req.debugID.present()) { + if(req.debugID.present()) g_traceBatch.addEvent("CommitDebug", tlogDebugID.get().first(), "TLog.tLogCommit.After"); - //FIXME - fbTrace(Reference(new CommitDebugTrace(req.debugID.get().first(), now(), - CommitDebugTrace::TLOG_TLOGCOMMIT_AFTER))); - } req.reply.send( logData->durableKnownCommittedVersion ); return Void(); @@ -2115,9 +2099,6 @@ ACTOR Future serveTLogInterface( TLogData* self, TLogInterface tli, Refere UID tlogDebugID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("TransactionAttachID", req.debugID.get().first(), tlogDebugID.first()); g_traceBatch.addEvent("TransactionDebug", tlogDebugID.first(), "TLogServer.TLogConfirmRunningRequest"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), - TransactionDebugTrace::TLOGSERVER_TLOGCONFIRMRUNNINGREQUEST))); } if (!logData->stopped) req.reply.send(Void()); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 59d337c449..e8a9bbca76 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -51,7 +51,6 @@ #include "fdbrpc/sim_validation.h" #include "fdbrpc/Smoother.h" #include "flow/Stats.h" -#include "flow/FBTrace.h" #include "flow/TDMetric.actor.h" #include #include "flow/actorcompiler.h" // This must be the last #include. @@ -839,21 +838,13 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { // so we need to downgrade here wait( delay(0, TaskPriority::DefaultEndpoint) ); - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new GetValueDebugTrace(req.debugID.get().first(), now(), - GetValueDebugTrace::STORAGESERVER_GETVALUE_DOREAD))); - } state Optional v; state Version version = wait( waitForVersion( data, req.version ) ); - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new GetValueDebugTrace(req.debugID.get().first(), now(), - GetValueDebugTrace::STORAGESERVER_GETVALUE_AFTERVERSION))); - } state uint64_t changeCounter = data->shardChangeCounter; @@ -906,12 +897,8 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { data->metrics.notifyBytesReadPerKSecond(req.key, bytesReadPerKSecond); } - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new GetValueDebugTrace(req.debugID.get().first(), now(), - GetValueDebugTrace::STORAGESERVER_GETVALUE_AFTERREAD))); - } GetValueReply reply(v); reply.penalty = data->getPenalty(); @@ -936,20 +923,12 @@ ACTOR Future watchValue_impl( StorageServer* data, WatchValueRequest req ) try { ++data->counters.watchQueries; - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.Before"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new WatchValueDebugTrace(req.debugID.get().first(), now(), - 
WatchValueDebugTrace::STORAGESERVER_WATCHVALUE_BEFORE))); - } wait(success(waitForVersionNoTooOld(data, req.version))); - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new WatchValueDebugTrace(req.debugID.get().first(), now(), - WatchValueDebugTrace::STORAGESERVER_WATCHVALUE_AFTERVERSION))); - } loop { try { @@ -966,12 +945,8 @@ ACTOR Future watchValue_impl( StorageServer* data, WatchValueRequest req ) debugMutation("ShardWatchValue", latest, MutationRef(MutationRef::DebugKey, req.key, reply.value.present() ? StringRef( reply.value.get() ) : LiteralStringRef("") ) ); - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new WatchValueDebugTrace(req.debugID.get().first(), now(), - WatchValueDebugTrace::STORAGESERVER_WATCHVALUE_AFTERREAD))); - } if( reply.value != req.value ) { req.reply.send(WatchValueReply{ latest }); @@ -1376,24 +1351,16 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) wait( delay(0, taskType) ); try { - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), - TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_BEFORE))); - } state Version version = wait( waitForVersion( data, req.version ) ); state uint64_t changeCounter = data->shardChangeCounter; // try { state KeyRange shard = getShardKeyRange( data, req.begin ); - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterVersion"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), - TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_AFTERVERSION))); - } //.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end); //} catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", "None").detail("In", "getKeyValues>getShardKeyRange"); throw e; } @@ -1408,12 +1375,8 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) state Future fEnd = req.end.isFirstGreaterOrEqual() ? 
Future(req.end.getKey()) : findKey( data, req.end, version, shard, &offset2 ); state Key begin = wait(fBegin); state Key end = wait(fEnd); - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterKeys"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), - TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_AFTERKEYS))); - } //.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey()); // Offsets of zero indicate begin/end keys in this shard, which obviously means we can answer the query @@ -1429,12 +1392,8 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) } if (begin >= end) { - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Send"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), - TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_SEND))); - } //.detail("Begin",begin).detail("End",end); GetKeyValuesReply none; @@ -1450,12 +1409,8 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) GetKeyValuesReply _r = wait( readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes) ); GetKeyValuesReply r = _r; - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterReadRange"); - //FIXME - fbTrace(Reference(new TransactionDebugTrace(req.debugID.get().first(), now(), - TransactionDebugTrace::STORAGESERVER_GETKEYVALUES_AFTERREADRANGE))); - } //.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size()); data->checkChangeCounter( changeCounter, KeyRangeRef( std::min(begin, std::min(req.begin.getKey(), req.end.getKey())), std::max(end, std::max(req.begin.getKey(), req.end.getKey())) ) ); if (EXPENSIVE_VALIDATION) { @@ -3647,12 +3602,8 @@ ACTOR Future storageServerCore( StorageServer* self, StorageServerInterfac } when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) { // Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work - if( req.debugID.present() ) { + if( req.debugID.present() ) g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.received"); //.detail("TaskID", g_network->getCurrentTask()); - //FIXME - fbTrace(Reference(new GetValueDebugTrace(req.debugID.get().first(), now(), - GetValueDebugTrace::STORAGESERVER_GETVALUE_RECEIVED))); - } if (SHORT_CIRCUT_ACTUAL_STORAGE && normalKeys.contains(req.key)) req.reply.send(GetValueReply()); diff --git a/fdbserver/workloads/ReadWrite.actor.cpp b/fdbserver/workloads/ReadWrite.actor.cpp index ce9623a079..cd7cc918c3 100644 --- a/fdbserver/workloads/ReadWrite.actor.cpp +++ b/fdbserver/workloads/ReadWrite.actor.cpp @@ -31,7 +31,6 @@ #include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbclient/ReadYourWrites.h" #include "flow/TDMetric.actor.h" -#include "flow/FBTrace.h" #include "flow/actorcompiler.h" // This must be the last #include. 
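
A note on the workload hunk that follows: the client tags a sampled transaction with a fresh UID via tr.debugTransaction, which is what links its TransactionDebug events to the server-side checkpoints edited above. Schematically (the sampling condition shown is illustrative, not the workload's actual rate):

    if (deterministicRandom()->random01() < 0.01) { // hypothetical sample rate
        debugID = deterministicRandom()->randomUniqueID();
        tr.debugTransaction(debugID); // servers see this UID as req.debugID
        g_traceBatch.addEvent("TransactionDebug", debugID.first(),
                              "ReadWrite.randomReadWriteClient.Before");
    }
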
 const int sampleSize = 10000;
@@ -619,9 +618,6 @@ struct ReadWriteWorkload : KVWorkload {
 					debugID = deterministicRandom()->randomUniqueID();
 					tr.debugTransaction(debugID);
 					g_traceBatch.addEvent("TransactionDebug", debugID.first(), "ReadWrite.randomReadWriteClient.Before");
-					//FIXME
-					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(debugID.first(), now(),
-					        TransactionDebugTrace::READWRITE_RANDOMREADWRITECLIENT_BEFORE)));
 				}
 				else {
 					debugID = UID();
@@ -694,12 +690,8 @@ struct ReadWriteWorkload : KVWorkload {
 					}
 				}
 
-				if(debugID != UID()) {
+				if(debugID != UID())
 					g_traceBatch.addEvent("TransactionDebug", debugID.first(), "ReadWrite.randomReadWriteClient.After");
-					//FIXME
-					fbTrace(Reference<TransactionDebugTrace>(new TransactionDebugTrace(debugID.first(), now(),
-					        TransactionDebugTrace::READWRITE_RANDOMREADWRITECLIENT_AFTER)));
-				}
 
 				tr = Trans();
 
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index 13404e2d47..3c76a108e0 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -44,20 +44,17 @@ public:
 	constexpr static FileIdentifier file_identifier = 617894;
 	enum codeLocation {
 		STORAGESERVER_GETVALUE_RECEIVED = 0,
-		STORAGESERVER_GETVALUE_DOREAD = 1,
-		STORAGESERVER_GETVALUE_AFTERVERSION = 2,
-		STORAGESERVER_GETVALUE_AFTERREAD = 3,
+		STORAGESERVER_GETVALUE_DO_READ = 1,
+		STORAGESERVER_GETVALUE_AFTER_VERSION = 2,
+		STORAGESERVER_GETVALUE_AFTER_READ = 3,
 		STORAGECACHE_GETVALUE_RECEIVED = 4,
-		STORAGECACHE_GETVALUE_DOREAD = 5,
-		STORAGECACHE_GETVALUE_AFTERVERSION = 6,
-		STORAGECACHE_GETVALUE_AFTERREAD = 7,
+		STORAGECACHE_GETVALUE_DO_READ = 5,
+		STORAGECACHE_GETVALUE_AFTER_VERSION = 6,
+		STORAGECACHE_GETVALUE_AFTER_READ = 7,
 		READER_GETVALUE_BEFORE = 8,
 		READER_GETVALUE_AFTER = 9,
 		READER_GETVALUEPREFIX_BEFORE = 10,
-		READER_GETVALUEPREFIX_AFTER = 11,
-		NATIVEAPI_GETVALUE_BEFORE = 12,
-		NATIVEAPI_GETVALUE_AFTER = 13,
-		NATIVEAPI_GETVALUE_ERROR = 14
+		READER_GETVALUEPREFIX_AFTER = 11
 	};
 
 	uint64_t id;
@@ -77,10 +74,10 @@ public:
 	constexpr static FileIdentifier file_identifier = 14486715;
 	enum codeLocation {
 		STORAGESERVER_WATCHVALUE_BEFORE = 1,
-		STORAGESERVER_WATCHVALUE_AFTERVERSION = 2,
-		STORAGESERVER_WATCHVALUE_AFTERREAD = 3,
+		STORAGESERVER_WATCHVALUE_AFTER_VERSION = 2,
+		STORAGESERVER_WATCHVALUE_AFTER_READ = 3,
 		NATIVEAPI_WATCHVALUE_BEFORE = 4,
-		NATIVEAPI_WATCHVALUE_AFTER = 5
+		NATIVEAPI_WATCHVALUE_AFTER_READ = 5
 	};
 
 	uint64_t id;
@@ -102,9 +99,9 @@ public:
-		STORAGESERVER_COMMIT_BEORE = 0,
+		STORAGESERVER_COMMIT_BEFORE = 0,
 		STORAGESERVER_COMMIT_AFTER_VERSION = 1,
 		STORAGESERVER_COMMIT_AFTER_READ = 2,
-		NATIVEAPI_COMMIT_BEORE = 3,
+		NATIVEAPI_COMMIT_BEFORE = 3,
 		NATIVEAPI_COMMIT_AFTER = 4,
-		MASTERROXYSERVER_BATCHER = 5,
+		MASTERPROXYSERVER_BATCHER = 5,
 		MASTERPROXYSERVER_COMMITBATCH_BEFORE = 6,
 		MASTERPROXYSERVER_COMMITBATCH_GETTINGCOMMITVERSION = 7,
 		MASTERPROXYSERVER_COMMITBATCH_GOTCOMMITVERSION = 8,
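[Editor's note] Each of the record classes above pairs a flatbuffers file_identifier with an (id, time, location) payload. A hedged sketch of the round-trip these classes are built for, assuming flow's ObjectWriter/ObjectReader as used later in this series (the helper name roundTrip is illustrative):

    // Sketch only: serialize one record and read it back.
    GetValueDebugTrace roundTrip(GetValueDebugTrace const& in) {
        ObjectWriter writer(Unversioned());
        writer.serialize(in);                 // buffer is tagged with file_identifier 617894
        StringRef bytes = writer.toStringRef();

        GetValueDebugTrace out;
        ObjectReader reader(bytes.begin(), Unversioned());
        reader.deserialize(out);              // id, time and location recovered in field order
        return out;
    }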
From 72028de14d618dd8c4eb40bd4508933f63649b51 Mon Sep 17 00:00:00 2001
From: Markus Pilman
Date: Fri, 17 Apr 2020 15:30:45 -0700
Subject: [PATCH 10/27] intermediate commit

---
 flow/CMakeLists.txt |   1 +
 flow/FBTrace.cpp    | 144 ++++++++++++++++++++++++++++++++++++++++++++
 flow/FBTrace.h      |  13 +++-
 3 files changed, 156 insertions(+), 2 deletions(-)
 create mode 100644 flow/FBTrace.cpp

diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt
index f50ee117e9..10c64d4992 100644
--- a/flow/CMakeLists.txt
+++ b/flow/CMakeLists.txt
@@ -21,6 +21,7 @@ set(FLOW_SRCS
   FaultInjection.cpp
   FaultInjection.h
   FBTrace.h
+  FBTrace.cpp
   FileTraceLogWriter.cpp
   FileTraceLogWriter.h
   Hash3.c
diff --git a/flow/FBTrace.cpp b/flow/FBTrace.cpp
new file mode 100644
index 0000000000..0a31dc68da
--- /dev/null
+++ b/flow/FBTrace.cpp
@@ -0,0 +1,144 @@
+/*
+ * FBTrace.cpp
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flow/FBTrace.h"
+#include "FastAlloc.h"
+#include "Platform.h"
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <new>
+
+namespace {
+
+// A special allocator that quickly allocates small objects
+// and deallocates them roughly in the same order
+struct Chunk {
+	// we'll use 1MB chunks
+	static constexpr size_t size = ChunkAllocator::MAX_CHUNK_SIZE;
+	// mutable because it is thread safe
+	// atomic because we ship these into
+	// a thread pool
+	mutable std::atomic<unsigned> refCount = 1;
+	unsigned offset = 0;
+
+	constexpr static size_t beginOffset() {
+		return sizeof(Chunk) % 8 == 0 ? sizeof(Chunk) : sizeof(Chunk) + (8 - (sizeof(Chunk) % 8));
+	}
+	static Chunk* create() {
+		void* buffer = aligned_alloc(8, Chunk::size + Chunk::beginOffset());
+		return new (buffer) Chunk{};
+	}
+
+	void delref() const {
+		if (refCount.fetch_sub(1) == 1) {
+			::aligned_free(const_cast<Chunk*>(this));
+		}
+	}
+	void addref() {
+		refCount.fetch_add(1);
+	}
+	static Chunk& getChunk(void* ptr) {
+		auto addr = reinterpret_cast<uintptr_t>(ptr);
+		unsigned offset = *reinterpret_cast<unsigned*>(addr - 4);
+		return *reinterpret_cast<Chunk*>(addr - offset);
+	}
+
+	uintptr_t begin() const {
+		return reinterpret_cast<uintptr_t>(this) + Chunk::beginOffset();
+	}
+	uintptr_t end() const {
+		return begin() + offset;
+	}
+
+	void* allocate(size_t sz) {
+		auto off = offset;
+		if (off + sz > Chunk::size) {
+			return nullptr; // chunk exhausted; the caller starts a new one
+		}
+		void* res = reinterpret_cast<void*>(end());
+		offset = off + sz;
+		addref();
+		return res;
+	}
+private:
+	// make sure nobody constructs a Chunk directly
+	Chunk() {}
+};
+
+struct ChunkDeleter {
+	void operator()(Chunk* ptr) const { ptr->delref(); }
+};
+
+struct ChunkAllocatorImpl {
+	std::unique_ptr<Chunk, ChunkDeleter> currentChunk;
+
+	ChunkAllocatorImpl() : currentChunk(Chunk::create()) {}
+
+	void* allocate(size_t sz) {
+		if (sz > ChunkAllocator::MAX_CHUNK_SIZE) {
+			auto res = reinterpret_cast<uint32_t*>(aligned_alloc(8, sz + 8));
+			// I don't think this is necessary, but it helps to debug because now this
+			// means that the 8 bytes before will be 0x00000000ffffffff
+			res[0] = 0;
+			res[1] = std::numeric_limits<uint32_t>::max();
+			return res + 2;
+		}
+		void* res = currentChunk->allocate(sz);
+		if (res == nullptr) {
+			currentChunk.reset(Chunk::create());
+			res = currentChunk->allocate(sz);
+		}
+		return res;
+	}
+
+	void free(void* ptr) {
+		auto i = reinterpret_cast<uint32_t*>(ptr);
+		auto off = *(i - 1);
+		if (off == std::numeric_limits<uint32_t>::max()) {
+			aligned_free(i - 2);
+		} else {
+			auto addr = reinterpret_cast<uintptr_t>(ptr);
+			reinterpret_cast<Chunk*>(addr - off)->delref();
+		}
+	}
+};
+
+ChunkAllocatorImpl chunkAllocator;
+
+} // namespace
+
+namespace ChunkAllocator {
+
+void* allocate(size_t sz) {
+	return chunkAllocator.allocate(sz);
+}
+
+void free(void* ptr) {
+	return chunkAllocator.free(ptr);
+}
+
+} // namespace ChunkAllocator
+
+void* FBTraceImpl::operator new(std::size_t sz) {
+	return ChunkAllocator::allocate(sz);
+}
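// [Editor's sketch, not part of the patch] The layout that getChunk() above and
// operator delete below rely on: a 4-byte tag sits just before each returned pointer
// and records the distance back to the owning chunk header, so deallocation needs no
// lookup table. A self-contained model with hypothetical names (Header/tag/owner),
// assuming <cstdint>; patch 11 completes the writing side of this scheme.
struct Header { unsigned refCount = 1; }; // stands in for Chunk

inline void* tag(Header* chunk, unsigned char* slot) {
	auto off = uint32_t(slot - reinterpret_cast<unsigned char*>(chunk));
	*reinterpret_cast<uint32_t*>(slot) = off; // offset of the tag itself
	return slot + 4;                          // user pointer starts after the tag
}

inline Header& owner(void* p) {
	auto addr = reinterpret_cast<uintptr_t>(p) - 4;  // step back to the tag
	uint32_t off = *reinterpret_cast<uint32_t*>(addr);
	return *reinterpret_cast<Header*>(addr - off);   // jump to the chunk header
}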
+
+void FBTraceImpl::operator delete(void* ptr) {
+	Chunk::getChunk(ptr).delref();
+}
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index 3c76a108e0..64a4268642 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -3,7 +3,7 @@
  *
  * This source file is part of the FoundationDB open source project
  *
- * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
+ * Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,18 +22,27 @@
 
 #include "flow/FastRef.h"
 #include "flow/ObjectSerializer.h"
+#include <cstddef>
 #include <type_traits>
 
+namespace ChunkAllocator {
+	constexpr size_t MAX_CHUNK_SIZE = 1 << 20;
+	void* allocate(size_t sz);
+	void free(void* ptr);
+};
+
 class FBTraceImpl : public ReferenceCounted<FBTraceImpl> {
 protected:
 	virtual void write(ObjectWriter& writer) = 0;
 public:
 	virtual ~FBTraceImpl();
+	static void* operator new(std::size_t sz);
+	static void operator delete(void* ptr);
 };
 
 template <class T>
 class FDBTrace : public FBTraceImpl {
-	protected:
+protected:
 	void write(ObjectWriter& writer) override {
 		writer.serialize(*static_cast<T*>(this));
 	}

From ca5946dd29925b002d17b96bd4afb81c676cd3e0 Mon Sep 17 00:00:00 2001
From: Markus Pilman
Date: Wed, 22 Apr 2020 11:51:31 -0700
Subject: [PATCH 11/27] Implemented a chunk allocator for FB messages

---
 flow/FBTrace.cpp | 265 +++++++++++++++++++++++++++++++++--------------
 flow/FBTrace.h   |  72 ++++++++++---
 flow/Trace.cpp   |   4 -
 3 files changed, 245 insertions(+), 96 deletions(-)
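[Editor's note] The chunk code in this patch keeps every header 8-byte aligned by rounding sizeof(Chunk) up in beginOffset(). A hedged one-liner: the conditional used there is equivalent to the usual round-up bit trick.

    // Sketch only: closed form of beginOffset()'s conditional.
    constexpr size_t alignUp8(size_t n) { return (n + 7) & ~size_t(7); }
    static_assert(alignUp8(12) == 16 && alignUp8(16) == 16, "round up to a multiple of 8");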
diff --git a/flow/FBTrace.cpp b/flow/FBTrace.cpp
index 0a31dc68da..3d27a65d78 100644
--- a/flow/FBTrace.cpp
+++ b/flow/FBTrace.cpp
@@ -20,6 +20,7 @@
 
 #include "flow/FBTrace.h"
 #include "FastAlloc.h"
+#include "FileIdentifier.h"
 #include "Platform.h"
 #include <atomic>
 #include <cstddef>
@@ -28,117 +29,223 @@
 #include <limits>
 #include <memory>
 #include <new>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
 
 namespace {
 
+struct TheChunkAllocator;
+
 // A special allocator that quickly allocates small objects
 // and deallocates them roughly in the same order
 struct Chunk {
+	friend class TheChunkAllocator;
 	// we'll use 1MB chunks
-	static constexpr size_t size = ChunkAllocator::MAX_CHUNK_SIZE;
+	static constexpr size_t size = ChunkAllocatorImpl::MAX_CHUNK_SIZE;
 	// mutable because it is thread safe
 	// atomic because we ship these into
 	// a thread pool
 	mutable std::atomic<unsigned> refCount = 1;
-	unsigned offset = 0;
+
+	TheChunkAllocator& theAllocator;
+	size_t freeSpace = size;
 
 	constexpr static size_t beginOffset() {
 		return sizeof(Chunk) % 8 == 0 ? sizeof(Chunk) : sizeof(Chunk) + (8 - (sizeof(Chunk) % 8));
 	}
-	static Chunk* create() {
-		void* buffer = aligned_alloc(8, Chunk::size + Chunk::beginOffset());
-		return new (buffer) Chunk{};
-	}
-
-	void delref() const {
-		if (refCount.fetch_sub(1) == 1) {
-			::aligned_free(const_cast<Chunk*>(this));
-		}
-	}
-	void addref() {
-		refCount.fetch_add(1);
-	}
-	static Chunk& getChunk(void* ptr) {
-		auto addr = reinterpret_cast<uintptr_t>(ptr);
-		unsigned offset = *reinterpret_cast<unsigned*>(addr - 4);
-		return *reinterpret_cast<Chunk*>(addr - offset);
-	}
-
-	uintptr_t begin() const {
-		return reinterpret_cast<uintptr_t>(this) + Chunk::beginOffset();
-	}
-	uintptr_t end() const {
-		return begin() + offset;
-	}
-
-	void* allocate(size_t sz) {
-		auto off = offset;
-		if (off + sz > Chunk::size) {
-			return nullptr;
-		}
-		void* res = reinterpret_cast<void*>(end());
-		offset = off + sz;
-		addref();
-		return res;
-	}
+	void delref();
+	void addref() { refCount.fetch_add(1); }
+
+	static void* ptr_add(void* ptr, std::ptrdiff_t offset) { return reinterpret_cast<uint8_t*>(ptr) + offset; }
+
+	void* begin() const { return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(this) + Chunk::beginOffset()); }
+	void* end() const { return reinterpret_cast<uint8_t*>(begin()) + (Chunk::size - freeSpace); }
+
+	uint32_t calcOffset(void* ptr) {
+		return uint32_t(reinterpret_cast<uintptr_t>(ptr) - reinterpret_cast<uintptr_t>(this));
+	}
+	static Chunk& getChunk(void* ptr) {
+		auto addr = reinterpret_cast<uintptr_t>(ptr) - 4;
+		unsigned offset = *reinterpret_cast<unsigned*>(addr);
+		return *reinterpret_cast<Chunk*>(addr - offset);
+	}
+
+	void* allocate(size_t sz) {
+		void* res = end();
+		res = std::align(4, sizeof(uint32_t), res, freeSpace);
+		if (res == nullptr) {
+			return nullptr;
+		}
+		// reserve the 4-byte offset tag, then align the payload behind it
+		res = ptr_add(res, sizeof(uint32_t));
+		freeSpace -= sizeof(uint32_t);
+		if (sz > 16) {
+			res = std::align(16, sz, res, freeSpace);
+		} else if (sz > 8) {
+			res = std::align(8, sz, res, freeSpace);
+		}
+		if (res == nullptr || sz > freeSpace) {
+			return nullptr;
+		}
+		auto offPtr = ptr_add(res, -std::ptrdiff_t(sizeof(uint32_t)));
+		*reinterpret_cast<uint32_t*>(offPtr) = calcOffset(offPtr);
+		freeSpace -= sz;
+		addref();
+		return res;
+	}
 
 private:
 	// make sure nobody constructs a Chunk directly
-	Chunk() {}
+	Chunk(TheChunkAllocator& theAllocator) : theAllocator(theAllocator) {}
 };
 
 struct ChunkDeleter {
 	void operator()(Chunk* ptr) const { ptr->delref(); }
 };
 
-struct ChunkAllocatorImpl {
+struct TheChunkAllocator {
+	static constexpr size_t MAX_FREE_LIST_SIZE = 2;
+	std::vector<Chunk*> freeList;
+	std::mutex freeListMutex;
 	std::unique_ptr<Chunk, ChunkDeleter> currentChunk;
 
-	ChunkAllocatorImpl() : currentChunk(Chunk::create()) {}
+	Chunk* createChunk() {
+		void* buffer = aligned_alloc(8, Chunk::size + Chunk::beginOffset());
+		return new (buffer) Chunk{ *this };
+	}
+
+	void freeChunk(Chunk* c) {
+		c->~Chunk();
+		aligned_free(c);
+	}
+
+	TheChunkAllocator() : currentChunk(createChunk()) {}
 
 	void* allocate(size_t sz) {
-		if (sz > ChunkAllocator::MAX_CHUNK_SIZE) {
+		if (sz > ChunkAllocatorImpl::MAX_CHUNK_SIZE) {
 			auto res = reinterpret_cast<uint32_t*>(aligned_alloc(8, sz + 8));
 			// I don't think this is necessary, but it helps to debug because now this
 			// means that the 8 bytes before will be 0x00000000ffffffff
 			res[0] = 0;
 			res[1] = std::numeric_limits<uint32_t>::max();
 			return res + 2;
 		}
 		void* res = currentChunk->allocate(sz);
 		if (res == nullptr) {
-			currentChunk.reset(Chunk::create());
+			currentChunk.reset(createChunk());
 			res = currentChunk->allocate(sz);
 		}
 		return res;
 	}
 
-	void free(void* ptr) {
-		auto i = reinterpret_cast<uint32_t*>(ptr);
-		auto off = *(i - 1);
-		if (off == std::numeric_limits<uint32_t>::max()) {
-			aligned_free(i - 2);
-		} else {
-			auto addr = reinterpret_cast<uintptr_t>(ptr);
-			reinterpret_cast<Chunk*>(addr - off)->delref();
-		}
-	}
-};
-
-ChunkAllocatorImpl chunkAllocator;
+	void free(void* ptr) {
+		auto i = reinterpret_cast<uint32_t*>(ptr);
+ auto off = *(i - 1); + if (off == std::numeric_limits::max()) { + aligned_free(i - 2); + } else { + auto addr = reinterpret_cast(ptr); + reinterpret_cast(addr - off)->delref(); + } + std::unique_lock _{ freeListMutex }; + if (freeList.size() > MAX_FREE_LIST_SIZE) { + freeChunk(freeList.back()); + freeList.pop_back(); + } + } }; -ChunkAllocatorImpl chunkAllocator; +void Chunk::delref() { + if (refCount.fetch_sub(1) == 1) { + freeSpace = Chunk::size; + std::unique_lock _{ theAllocator.freeListMutex }; + theAllocator.freeList.push_back(this); + } +} + +bool mainThreadIsRunning = true; +struct MainThreadRunning { + ~MainThreadRunning() { mainThreadIsRunning = false; } +}; +MainThreadRunning _mainThreadRunningHelper; + +class FBFactoryState { + using Lock = std::unique_lock; + FBFactoryState() {} + FBFactoryState(FBFactoryState const&) = delete; + FBFactoryState(FBFactoryState&&) = delete; + FBFactoryState& operator=(FBFactoryState const&) = delete; + FBFactoryState& operator=(FBFactoryState&&) = delete; + std::mutex mutex; + std::unordered_map factories; + +public: + void addFactory(FileIdentifier fId, FBFactory* f) { + Lock _{ mutex }; + ASSERT(factories.emplace(fId, f).second); + } + + static FBFactoryState& instance() { + static std::mutex constructionMutex; + static std::shared_ptr myInstance; + // this pointer makes sure that if the main thread has already + // gone down, any other thread will still have a valid copy + static thread_local std::shared_ptr this_copy; + if (this_copy) { + return *this_copy; + } + // we can't construct anymore of the main thread shut down + // throw an error instead - alternative is undefined behavior + ASSERT(mainThreadIsRunning); + if (myInstance) { + this_copy = myInstance; + } else { + Lock _(constructionMutex); + if (!myInstance) { + // this means we will leak this memory + // This prevents problems with multiple + // threads shutting down + myInstance = std::make_shared(); + } + } + this_copy = myInstance; + return *this_copy; + } +}; + +TheChunkAllocator chunkAllocator; } // namespace -namespace ChunkAllocator { +namespace ChunkAllocatorImpl { void* allocate(size_t sz) { - return chunkAllocator.allocate(sz); + return chunkAllocator.allocate(sz); } void free(void* ptr) { - return chunkAllocator.free(ptr); + return chunkAllocator.free(ptr); } -} // namespace ChunkAllocator +} // namespace ChunkAllocatorImpl + +FBFactory::FBFactory(FileIdentifier fid) { + FBFactoryState::instance().addFactory(fid, this); +} +FBFactory::~FBFactory() {} + +void FBTraceImpl::addref() const { + refCount.fetch_add(1); +} + +void FBTraceImpl::delref() const { + if (refCount.fetch_sub(1) == 1) { + delete this; + } +} void* FBTraceImpl::operator new(std::size_t sz) { - return ChunkAllocator::allocate(sz); + return ChunkAllocatorImpl::allocate(sz); } void FBTraceImpl::operator delete(void* ptr) { - Chunk::getChunk(ptr).delref(); + Chunk::getChunk(ptr).delref(); } + +FBTraceImpl::~FBTraceImpl() {} + +void fbTrace(Reference const& traceLine) {} diff --git a/flow/FBTrace.h b/flow/FBTrace.h index 64a4268642..acbcb79e00 100644 --- a/flow/FBTrace.h +++ b/flow/FBTrace.h @@ -20,35 +20,81 @@ #pragma once +#include "FileIdentifier.h" #include "flow/FastRef.h" #include "flow/ObjectSerializer.h" #include #include -namespace ChunkAllocator { - constexpr size_t MAX_CHUNK_SIZE = 1 << 20; - void* allocate(size_t sz); - void free(void* ptr); +namespace ChunkAllocatorImpl { +constexpr size_t MAX_CHUNK_SIZE = 1 << 20; +void* allocate(size_t sz); +void free(void* ptr); +}; // namespace 
ChunkAllocatorImpl + +template +struct ChunkAllocator { + using value_type = T; + using pointer = value_type*; + using is_always_equal = std::true_type; + + pointer allocate(size_t count) { return ChunkAllocatorImpl::allocate(count * sizeof(value_type)); } + + void deallocate(pointer ptr, size_t) { ChunkAllocatorImpl::free(ptr); } }; -class FBTraceImpl : public ReferenceCounted { +class FBTraceImpl { + mutable std::atomic refCount = 1; + protected: virtual void write(ObjectWriter& writer) = 0; + virtual void read(ObjectReader& reader) = 0; + virtual void read(ArenaObjectReader& reader) = 0; + public: virtual ~FBTraceImpl(); + virtual FileIdentifier getFileIdentifier() const = 0; static void* operator new(std::size_t sz); static void operator delete(void* ptr); + void addref() const; + void delref() const; + +public: + FBTraceImpl() {} + FBTraceImpl(const FBTraceImpl& o) : refCount(1) {} + + FBTraceImpl& operator=(FBTraceImpl const& o) { return *this; } +}; + +struct FBFactory { + FBFactory(FileIdentifier fid); + virtual ~FBFactory(); + virtual Reference create() const = 0; + virtual FileIdentifier fileIdentified() const = 0; }; template -class FDBTrace : public FBTraceImpl { +class FBTrace : public FBTraceImpl { + struct FBFactoryImpl : FBFactory { + FBFactoryImpl() : FBFactory(T::file_identifier) {} + Reference create() const override { return Reference{ new T{} }; } + FileIdentifier fileIdentified() const override { return T::file_identifier; } + }; + static FBFactoryImpl factory; + protected: - void write(ObjectWriter& writer) override { - writer.serialize(*static_cast(this)); - } + void write(ObjectWriter& writer) override { writer.serialize(*static_cast(this)); } + void read(ObjectReader& reader) override { reader.deserialize(*static_cast(this)); } + void read(ArenaObjectReader& reader) { reader.deserialize(*static_cast(this)); } + +public: + FileIdentifier getFileIdentifier() const override { return T::file_identifier; } }; -class GetValueDebugTrace : public FDBTrace { +template +typename FBTrace::FBFactoryImpl FBTrace::factory; + +class GetValueDebugTrace : public FBTrace { public: constexpr static FileIdentifier file_identifier = 617894; enum codeLocation { @@ -78,7 +124,7 @@ public: } }; -class WatchValueDebugTrace : public FDBTrace { +class WatchValueDebugTrace : public FBTrace { public: constexpr static FileIdentifier file_identifier = 14486715; enum codeLocation { @@ -101,7 +147,7 @@ public: } }; -class CommitDebugTrace : public FDBTrace { +class CommitDebugTrace : public FBTrace { public: constexpr static FileIdentifier file_identifier = 7691518; enum codeLocation { @@ -140,7 +186,7 @@ public: } }; -class TransactionDebugTrace : public FDBTrace { +class TransactionDebugTrace : public FBTrace { public: constexpr static FileIdentifier file_identifier = 6868728; enum codeLocation { diff --git a/flow/Trace.cpp b/flow/Trace.cpp index c98c5c0bf6..31e644b864 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -1490,7 +1490,3 @@ std::string traceableStringToString(const char* value, size_t S) { return std::string(value, S - 1); // Exclude trailing \0 byte } - -FBTraceImpl::~FBTraceImpl() {} - -void fbTrace(Reference const& traceLine) {} From 451eb041fb99bdde9d11bce38efc91a51ddae9b7 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 22 Apr 2020 14:50:33 -0700 Subject: [PATCH 12/27] Made interface nicer --- fdbserver/StorageCache.actor.cpp | 2 +- flow/FBTrace.cpp | 19 ++++++++++--------- flow/FBTrace.h | 9 ++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git 
a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp
index 5e2471cb9a..45fa3e88b3 100644
--- a/fdbserver/StorageCache.actor.cpp
+++ b/fdbserver/StorageCache.actor.cpp
@@ -250,7 +250,7 @@ ACTOR Future<Void> getValueQ( StorageCacheData* data, GetValueRequest req ) {
 	if( req.debugID.present() ) {
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
 		//FIXME
-		fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ)));
+		fbTrace<GetValueDebugTrace>(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ);
 	}
 
 	state Optional<Value> v;
diff --git a/flow/FBTrace.cpp b/flow/FBTrace.cpp
index 3d27a65d78..2368591e94 100644
--- a/flow/FBTrace.cpp
+++ b/flow/FBTrace.cpp
@@ -166,7 +166,6 @@ MainThreadRunning _mainThreadRunningHelper;
 
 class FBFactoryState {
 	using Lock = std::unique_lock<std::mutex>;
-	FBFactoryState() {}
 	FBFactoryState(FBFactoryState const&) = delete;
 	FBFactoryState(FBFactoryState&&) = delete;
 	FBFactoryState& operator=(FBFactoryState const&) = delete;
@@ -175,6 +174,8 @@ class FBFactoryState {
 	std::unordered_map<FileIdentifier, FBFactory*> factories;
 
 public:
+	FBFactoryState() {} // has to be public for std::make_shared
+
 	void addFactory(FileIdentifier fId, FBFactory* f) {
 		Lock _{ mutex };
 		ASSERT(factories.emplace(fId, f).second);
@@ -195,13 +196,13 @@ public:
 		if (myInstance) {
 			this_copy = myInstance;
 		} else {
-		 Lock _(constructionMutex);
-		 if (!myInstance) {
-			 // this means we will leak this memory
-			 // This prevents problems with multiple
-			 // threads shutting down
-			 myInstance = std::make_shared<FBFactoryState>();
-		 }
+			Lock _(constructionMutex);
+			if (!myInstance) {
+				// this means we will leak this memory
+				// This prevents problems with multiple
+				// threads shutting down
+				myInstance = std::make_shared<FBFactoryState>();
+			}
 		}
 		this_copy = myInstance;
 		return *this_copy;
@@ -248,4 +249,4 @@ void FBTraceImpl::operator delete(void* ptr) {
 
 FBTraceImpl::~FBTraceImpl() {}
 
-void fbTrace(Reference<FBTraceImpl> const& traceLine) {}
+void fbTraceImpl(Reference<FBTraceImpl> const& traceLine) {}
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index acbcb79e00..4dfdac12e7 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -239,10 +239,9 @@ public:
 	}
 };
 
-void fbTrace(Reference<FBTraceImpl> const& traceLine);
+void fbTraceImpl(Reference<FBTraceImpl> const& traceLine);
 
-template <class T>
-void fbTrace(Reference<T> traceLine) {
-	static_assert(std::is_base_of<FBTraceImpl, T>::value, "fbTrace only accepts FBTraceImpl as argument");
-	fbTrace(traceLine.template castTo<FBTraceImpl>());
-}
+template <class Type, class... Args>
+std::enable_if_t<std::is_base_of_v<FBTraceImpl, Type>> fbTrace(Args&&... args) {
+	fbTraceImpl(Reference<FBTraceImpl>(new Type{std::forward<Args>(args)...}));
+}
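[Editor's note] A hedged before/after of what the "nicer" interface buys at a call site; the record type still has to be spelled explicitly, since nothing in the argument list lets the compiler deduce Type:

    // Patch 06 style: wrap the record in a Reference by hand.
    fbTrace(Reference<GetValueDebugTrace>(new GetValueDebugTrace(
        debugID.first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ)));

    // Patch 12 style: the variadic template forwards straight to the constructor.
    fbTrace<GetValueDebugTrace>(debugID.first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ);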
From 2f0339334fe6d613fe76d61f42afb3fa60a22c29 Mon Sep 17 00:00:00 2001
From: Markus Pilman
Date: Tue, 28 Apr 2020 20:52:23 +0000
Subject: [PATCH 13/27] moved dummy pool - started implementation of backend

---
 flow/FBTrace.cpp   | 12 ++++++++++++
 flow/FBTrace.h     |  4 +++-
 flow/IThreadPool.h | 36 ++++++++++++++++++++++++++++++++++++
 flow/Trace.cpp     | 35 -----------------------------------
 4 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/flow/FBTrace.cpp b/flow/FBTrace.cpp
index 2368591e94..a7e16738c8 100644
--- a/flow/FBTrace.cpp
+++ b/flow/FBTrace.cpp
@@ -30,6 +30,7 @@
 #include <memory>
 #include <mutex>
 #include <new>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
@@ -211,6 +212,12 @@ public:
 
 TheChunkAllocator chunkAllocator;
 
+struct FBTraceLog {
+	void open(const std::string& directory, const std::string& processName, unsigned rollsize, unsigned maxLogSize) {}
+};
+
+thread_local FBTraceLog g_fbTraceLog;
+
 } // namespace
 
 namespace ChunkAllocatorImpl {
@@ -249,4 +256,9 @@ void FBTraceImpl::operator delete(void* ptr) {
 
 FBTraceImpl::~FBTraceImpl() {}
 
+void FBTraceImpl::open(const std::string& directory, const std::string& processName, unsigned rollsize,
+                       unsigned maxLogSize) {
+	g_fbTraceLog.open(directory, processName, rollsize, maxLogSize);
+}
+
 void fbTraceImpl(Reference<FBTraceImpl> const& traceLine) {}
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index 4dfdac12e7..e0eb3fbdb4 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -56,6 +56,8 @@ public:
 	virtual FileIdentifier getFileIdentifier() const = 0;
 	static void* operator new(std::size_t sz);
 	static void operator delete(void* ptr);
+	static void open(std::string const& directory, std::string const& processName, unsigned rollSize,
+	                 unsigned maxLogSize);
 	void addref() const;
 	void delref() const;
 
@@ -243,5 +245,5 @@ void fbTraceImpl(Reference<FBTraceImpl> const& traceLine);
 
 template <class Type, class... Args>
 std::enable_if_t<std::is_base_of_v<FBTraceImpl, Type>> fbTrace(Args&&... args) {
-	fbTraceImpl(Reference<FBTraceImpl>(new Type{std::forward<Args>(args)...}));
+	fbTraceImpl(Reference<FBTraceImpl>(new Type{ std::forward<Args>(args)... }));
 }
diff --git a/flow/IThreadPool.h b/flow/IThreadPool.h
index af76010895..bde2d88459 100644
--- a/flow/IThreadPool.h
+++ b/flow/IThreadPool.h
@@ -105,5 +105,41 @@ private:
 
 Reference<IThreadPool> createGenericThreadPool();
 
+class DummyThreadPool : public IThreadPool, ReferenceCounted<DummyThreadPool> {
+public:
+	~DummyThreadPool() {}
+	DummyThreadPool() : thread(NULL) {}
+	Future<Void> getError() {
+		return errors.getFuture();
+	}
+	void addThread( IThreadPoolReceiver* userData ) {
+		ASSERT( !thread );
+		thread = userData;
+	}
+	void post( PThreadAction action ) {
+		try {
+			(*action)( thread );
+		} catch (Error& e) {
+			errors.sendError( e );
+		} catch (...) {
+			errors.sendError( unknown_error() );
+		}
+	}
+	Future<Void> stop(Error const& e) {
+		return Void();
+	}
+	void addref() {
+		ReferenceCounted<DummyThreadPool>::addref();
+	}
+	void delref() {
+		ReferenceCounted<DummyThreadPool>::delref();
+	}
+
+private:
+	IThreadPoolReceiver* thread;
+	Promise<Void> errors;
+};
+
+
 #endif
diff --git a/flow/Trace.cpp b/flow/Trace.cpp
index 31e644b864..92e3eba8b1 100644
--- a/flow/Trace.cpp
+++ b/flow/Trace.cpp
@@ -55,41 +55,6 @@
 // during an open trace event
 thread_local int g_allocation_tracing_disabled = 1;
 
-class DummyThreadPool : public IThreadPool, ReferenceCounted<DummyThreadPool> {
-public:
-	~DummyThreadPool() {}
-	DummyThreadPool() : thread(NULL) {}
-	Future<Void> getError() {
-		return errors.getFuture();
-	}
-	void addThread( IThreadPoolReceiver* userData ) {
-		ASSERT( !thread );
-		thread = userData;
-	}
-	void post( PThreadAction action ) {
-		try {
-			(*action)( thread );
-		} catch (Error& e) {
-			errors.sendError( e );
-		} catch (...) {
-			errors.sendError( unknown_error() );
-		}
-	}
-	Future<Void> stop(Error const& e) {
-		return Void();
-	}
-	void addref() {
-		ReferenceCounted<DummyThreadPool>::addref();
-	}
-	void delref() {
-		ReferenceCounted<DummyThreadPool>::delref();
-	}
-
-private:
-	IThreadPoolReceiver* thread;
-	Promise<Void> errors;
-};
-
 struct SuppressionMap {
 	struct SuppressionInfo {
 		double endTime;
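[Editor's note] DummyThreadPool keeps the IThreadPool shape without spawning a thread: post() runs the action inline on the calling network thread, which is what simulation needs for determinism. A hedged sketch of how the later log-writer patch picks a pool (MyReceiver is hypothetical):

    Reference<IThreadPool> pool;
    if (g_network->isSimulated()) {
        pool = Reference<IThreadPool>(new DummyThreadPool()); // actions run inline
    } else {
        pool = createGenericThreadPool();                     // real OS thread
    }
    pool->addThread(new MyReceiver()); // MyReceiver : IThreadPoolReceiver, illustrative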
From 99fda86fedb4cbe06713eea252fa23a5247a2eb4 Mon Sep 17 00:00:00 2001
From: Markus Pilman
Date: Mon, 4 May 2020 17:11:33 +0000
Subject: [PATCH 14/27] intermediate commit

---
 flow/FBTrace.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/flow/FBTrace.cpp b/flow/FBTrace.cpp
index a7e16738c8..bc605588bb 100644
--- a/flow/FBTrace.cpp
+++ b/flow/FBTrace.cpp
@@ -213,7 +213,11 @@ public:
 TheChunkAllocator chunkAllocator;
 
 struct FBTraceLog {
-	void open(const std::string& directory, const std::string& processName, unsigned rollsize, unsigned maxLogSize) {}
+	std::string directory;
+	std::string processName;
+
+	void open(const std::string& directory, const std::string& processName, unsigned rollsize, unsigned maxLogSize) {
+	}
 };
 
 thread_local FBTraceLog g_fbTraceLog;
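[Editor's note] Once the next patch lands, FBTraceLog::write() drops records until open() has run, so a process should open the flatbuffers backend before its first fbTrace call. A hedged sketch of that ordering (paths and sizes are illustrative; the five-argument open appears in the next patch):

    FBTraceImpl::open("/var/log/fdb", "fdbserver", "trace",
                      /*rollSize*/ 10 << 20, /*maxLogsSize*/ 100 << 20);
    UID id = deterministicRandom()->randomUniqueID();
    fbTrace<GetValueDebugTrace>(id.first(), now(), GetValueDebugTrace::READER_GETVALUE_BEFORE);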
From 1453ca6303a734a7b8b0446ad81361bdd3fc1096 Mon Sep 17 00:00:00 2001
From: Markus Pilman
Date: Tue, 5 May 2020 17:58:41 -0700
Subject: [PATCH 15/27] Added log writer

---
 flow/FBTrace.cpp            | 106 +++++++++++++++++++++++++++++++++---
 flow/FBTrace.h              |  28 ++++++++--
 flow/FileTraceLogWriter.cpp |  42 +++++++++++++-
 flow/FileTraceLogWriter.h   |  31 +++++++++--
 flow/Trace.cpp              |  27 ---------
 flow/Trace.h                |   4 ++
 6 files changed, 191 insertions(+), 47 deletions(-)

diff --git a/flow/FBTrace.cpp b/flow/FBTrace.cpp
index bc605588bb..15b78081f5 100644
--- a/flow/FBTrace.cpp
+++ b/flow/FBTrace.cpp
@@ -19,11 +19,18 @@
  */
 
 #include "flow/FBTrace.h"
-#include "FastAlloc.h"
-#include "FileIdentifier.h"
-#include "Platform.h"
+#include "flow/FileTraceLogWriter.h"
+#include "flow/FastAlloc.h"
+#include "flow/FileIdentifier.h"
+#include "flow/IThreadPool.h"
+#include "flow/Platform.h"
+#include "flow/ThreadHelper.actor.h"
+#include "flow/Trace.h"
+#include "flow/network.h"
+#include "flow/serialize.h"
 #include <atomic>
 #include <cstddef>
+#include <chrono>
 #include <cstdint>
 #include <cstdlib>
 #include <limits>
@@ -212,11 +219,92 @@ public:
 
 TheChunkAllocator chunkAllocator;
 
+struct WriterThread;
+
+template <class T>
+struct ZeroEstimateAction : ChunkAllocated<T>, TypedAction<WriterThread, T> {
+	double getTimeEstimate() override { return 0; }
+};
+
+struct WriterThread : IThreadPoolReceiver {
+	Reference<ITraceLogWriter> logWriter;
+	unsigned rollsize;
+
+	explicit WriterThread(Reference<ITraceLogWriter> logWriter, unsigned rollsize)
+	  : logWriter(logWriter), rollsize(rollsize) {}
+
+	void init() override {}
+
+	struct Open : ZeroEstimateAction<Open> {};
+	void action(Open&) {
+		logWriter->open();
+	}
+
+	struct Close : ZeroEstimateAction<Close> {};
+	void action(Close&) {
+		logWriter->close();
+	}
+
+	struct Roll : ZeroEstimateAction<Roll> {};
+	void action(Roll&) {
+		logWriter->roll();
+	}
+
+	struct Write : ZeroEstimateAction<Write> {
+		Reference<FBTraceImpl> msg;
+		explicit Write(Reference<FBTraceImpl> const& msg) : msg(msg) {}
+	};
+	void action(Write& w) {
+		ObjectWriter writer(Unversioned());
+		w.msg->write(writer);
+		logWriter->write(writer.toStringRef());
+	}
+};
+
+using namespace std::literals;
+
-struct FBTraceLog {
-	std::string directory;
-	std::string processName;
-
-	void open(const std::string& directory, const std::string& processName, unsigned rollsize, unsigned maxLogSize) {
-	}
-};
+struct FBTraceLog {
+	using Clock = std::chrono::steady_clock;
+	std::string directory;
+	std::string processName;
+	Reference<ITraceLogWriter> logWriter;
+	Reference<IThreadPool> writer;
+	Reference<IssuesList> issues;
+	Clock::time_point latestTrace;
+	bool opened = false;
+
+	FBTraceLog() : issues(new IssuesList{}), latestTrace(Clock::now() - 1s) {}
+
+	void traceIssues() {
+		// we want to trace at most once per second
+		if (Clock::now() < (latestTrace + 1s)) {
+			return;
+		}
+		latestTrace = Clock::now();
+		std::set<std::string> issueSet;
+		issues->retrieveIssues(issueSet);
+		onMainThread([&issueSet]() -> Future<Void> {
+			TraceEvent evt(SevError, "FBTraceHasIssues");
+			evt.detail("NumIssues", issueSet.size());
+			int cnt = 1;
+			for (auto const& s : issueSet) {
+				evt.detail(format("Issue%d", cnt++).c_str(), s);
+			}
+			return Void();
+		}).getBlocking();
+	}
+
+	void open(const std::string& directory, const std::string& processName, const std::string& basename,
+	          unsigned rollsize, unsigned maxLogsSize) {
+		if (g_network->isSimulated()) {
+			writer = Reference<IThreadPool>{ new DummyThreadPool{} };
+		} else {
+			writer = createGenericThreadPool();
+		}
+		logWriter = Reference<ITraceLogWriter>(new FileTraceLogWriter(
+		    directory, processName, basename, "fb", maxLogsSize, [this]() { traceIssues(); }, issues));
+		writer->addThread(new WriterThread{ logWriter, rollsize });
+		writer->post(new WriterThread::Open{});
+		opened = true;
+	}
+
+	void write(Reference<FBTraceImpl> const& msg) {
+		if (!opened) {
+			return;
+		}
+		writer->post(new WriterThread::Write{ msg });
+	}
+};
 
 thread_local FBTraceLog g_fbTraceLog;
 
 } // namespace
@@ -348,9 +436,11 @@ void FBTraceImpl::operator delete(void* ptr) {
 
 FBTraceImpl::~FBTraceImpl() {}
 
-void FBTraceImpl::open(const std::string& directory, const std::string& processName, unsigned rollsize,
-                       unsigned maxLogSize) {
-	g_fbTraceLog.open(directory, processName, rollsize, maxLogSize);
+void FBTraceImpl::open(const std::string& directory, const std::string& processName, const std::string& basename,
+                       unsigned rollsize, unsigned maxLogSize) {
+	g_fbTraceLog.open(directory, processName, basename, rollsize, maxLogSize);
 }
 
-void fbTraceImpl(Reference<FBTraceImpl> const& traceLine) {}
+void fbTraceImpl(Reference<FBTraceImpl> const& traceLine) {
+	g_fbTraceLog.write(traceLine);
+}
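[Editor's note] All file I/O above goes through small action objects: the network thread allocates an action from the chunk allocator and posts it, and serialization plus the actual write happen on the pool thread. A hedged sketch of the hand-off (record construction is illustrative):

    // Sketch only: what FBTraceLog::write() does with a record.
    Reference<FBTraceImpl> line(
        new GetValueDebugTrace(id.first(), now(), GetValueDebugTrace::READER_GETVALUE_AFTER));
    writer->post(new WriterThread::Write{ line }); // ObjectWriter runs on the writer thread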
diff --git a/flow/FBTrace.h b/flow/FBTrace.h
index e0eb3fbdb4..da9a240a7a 100644
--- a/flow/FBTrace.h
+++ b/flow/FBTrace.h
@@ -20,7 +20,7 @@
 
 #pragma once
 
-#include "FileIdentifier.h"
+#include "flow/FileIdentifier.h"
 #include "flow/FastRef.h"
 #include "flow/ObjectSerializer.h"
 #include <atomic>
@@ -43,20 +43,39 @@ struct ChunkAllocator {
 
 	void deallocate(pointer ptr, size_t) { ChunkAllocatorImpl::free(ptr); }
 };
 
+template <class Object>
+class ChunkAllocated {
+public:
+	[[nodiscard]] static void* operator new(size_t s) {
+		if (s != sizeof(Object)) abort();
+		INSTRUMENT_ALLOCATE(typeid(Object).name());
+		void* p = ChunkAllocatorImpl::allocate(sizeof(Object));
+		return p;
+	}
+
+	static void operator delete(void* s) {
+		INSTRUMENT_RELEASE(typeid(Object).name());
+		ChunkAllocatorImpl::free(s);
+	}
+	// Redefine placement new so you can still use it
+	static void* operator new(size_t, void* p) { return p; }
+	static void operator delete(void*, void*) {}
+};
+
 class FBTraceImpl {
 	mutable std::atomic<unsigned> refCount = 1;
 
-protected:
+public:
 	virtual void write(ObjectWriter& writer) = 0;
 	virtual void read(ObjectReader& reader) = 0;
 	virtual void read(ArenaObjectReader& reader) = 0;
 
-public:
 	virtual ~FBTraceImpl();
 	virtual FileIdentifier getFileIdentifier() const = 0;
 	static void* operator new(std::size_t sz);
 	static void operator delete(void* ptr);
-	static void open(std::string const& directory, std::string const& processName, unsigned rollSize,
-	                 unsigned maxLogSize);
+	static void open(std::string const& directory, std::string const& processName, std::string const& basename,
+	                 unsigned rollSize, unsigned maxLogSize);
 	void addref() const;
 	void delref() const;
diff --git a/flow/FileTraceLogWriter.cpp b/flow/FileTraceLogWriter.cpp
index 3e4d0bdcd4..9df3bcf20c 100644
--- a/flow/FileTraceLogWriter.cpp
+++ b/flow/FileTraceLogWriter.cpp
@@ -48,6 +48,36 @@
 #include <fcntl.h>
 #include <cmath>
 
+struct IssuesListImpl {
+	IssuesListImpl() {}
+	void addIssue(std::string issue) {
+		MutexHolder h(mutex);
+		issues.insert(issue);
+	}
+
+	void retrieveIssues(std::set<std::string>& out) {
+		MutexHolder h(mutex);
+		for (auto const& i : issues) {
+			out.insert(i);
+		}
+	}
+
+	void resolveIssue(std::string issue) {
+		MutexHolder h(mutex);
+		issues.erase(issue);
+	}
+
+private:
+	Mutex mutex;
+	std::set<std::string> issues;
+};
+
+IssuesList::IssuesList() : impl(new IssuesListImpl{}) {}
+IssuesList::~IssuesList() { delete impl; }
+void IssuesList::addIssue(std::string issue) { impl->addIssue(issue); }
+void IssuesList::retrieveIssues(std::set<std::string>& out) { impl->retrieveIssues(out); }
+void IssuesList::resolveIssue(std::string issue) { impl->resolveIssue(issue); }
+
 FileTraceLogWriter::FileTraceLogWriter(std::string directory, std::string processName,
std::string basename, std::string extension, uint64_t maxLogsSize, std::function onError, Reference issues); @@ -51,11 +71,12 @@ public: void lastError(int err); - void write(const std::string& str); - void open(); - void close(); - void roll(); - void sync(); + void write(const std::string& str) override; + void write(StringRef const& str) override; + void open() override; + void close() override; + void roll() override; + void sync() override; void cleanupTraceFiles(); }; diff --git a/flow/Trace.cpp b/flow/Trace.cpp index 92e3eba8b1..f79fac1176 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -195,33 +195,6 @@ public: } }; - struct IssuesList : ITraceLogIssuesReporter, ThreadSafeReferenceCounted { - IssuesList(){}; - void addIssue(std::string issue) override { - MutexHolder h(mutex); - issues.insert(issue); - } - - void retrieveIssues(std::set& out) override { - MutexHolder h(mutex); - for (auto const& i : issues) { - out.insert(i); - } - } - - void resolveIssue(std::string issue) override { - MutexHolder h(mutex); - issues.erase(issue); - } - - void addref() { ThreadSafeReferenceCounted::addref(); } - void delref() { ThreadSafeReferenceCounted::delref(); } - - private: - Mutex mutex; - std::set issues; - }; - Reference issues; Reference barriers; diff --git a/flow/Trace.h b/flow/Trace.h index 3aeb5a9a8d..1abd0f7a73 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -512,11 +512,14 @@ private: bool init( struct TraceInterval& ); }; +class StringRef; + struct ITraceLogWriter { virtual void open() = 0; virtual void roll() = 0; virtual void close() = 0; virtual void write(const std::string&) = 0; + virtual void write(const StringRef&) = 0; virtual void sync() = 0; virtual void addref() = 0; @@ -534,6 +537,7 @@ struct ITraceLogFormatter { }; struct ITraceLogIssuesReporter { + virtual ~ITraceLogIssuesReporter(); virtual void addIssue(std::string issue) = 0; virtual void resolveIssue(std::string issue) = 0; From 1c8db89ad494a4b68025c2aa850f5f42bb877b82 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 6 May 2020 14:18:53 -0700 Subject: [PATCH 16/27] Implement missing destructor --- flow/Trace.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flow/Trace.cpp b/flow/Trace.cpp index f79fac1176..e315a50027 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -55,6 +55,8 @@ // during an open trace event thread_local int g_allocation_tracing_disabled = 1; +ITraceLogIssuesReporter::~ITraceLogIssuesReporter() {} + struct SuppressionMap { struct SuppressionInfo { double endTime; From 8a7d98ab21b3fd5f00ea6f60e25f5661e506844b Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 27 May 2020 15:36:04 -0700 Subject: [PATCH 17/27] Started instrumentation --- fdbclient/FDBTypes.h | 1 + fdbclient/NativeAPI.actor.cpp | 12 ++++++++---- fdbclient/NativeAPI.actor.h | 5 ++++- fdbclient/StorageServerInterface.h | 14 +++++++++----- fdbserver/StorageCache.actor.cpp | 2 +- fdbserver/storageserver.actor.cpp | 8 ++++++-- flow/FBTrace.h | 17 +++++++++-------- flow/Trace.cpp | 4 ++++ 8 files changed, 42 insertions(+), 21 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 5a3ace0532..064b424367 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -35,6 +35,7 @@ typedef uint64_t Sequence; typedef StringRef KeyRef; typedef StringRef ValueRef; typedef int64_t Generation; +typedef UID TransactionID; enum { tagLocalitySpecial = -1, diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 28f764184b..9eef80ce3e 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ 
b/fdbclient/NativeAPI.actor.cpp
@@ -43,6 +43,7 @@
 #include "fdbrpc/simulator.h"
 #include "flow/ActorCollection.h"
 #include "flow/DeterministicRandom.h"
+#include "flow/IRandom.h"
 #include "flow/Knobs.h"
 #include "flow/Platform.h"
 #include "flow/SystemMonitor.h"
@@ -1531,10 +1532,12 @@ ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Database cx,
 	}
 	choose {
 		when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
-		when(GetValueReply _reply =
-			 wait(loadBalance(ssi.second, &StorageServerInterface::getValue,
-		                      GetValueRequest(key, ver, cx->sampleReadTags() ? tags : Optional<TagSet>(), getValueID), TaskPriority::DefaultPromiseEndpoint, false,
-		                      cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
+		when(GetValueReply _reply = wait(
+		         loadBalance(ssi.second, &StorageServerInterface::getValue,
+		                     GetValueRequest(key, ver, cx->sampleReadTags() ? tags : Optional<TagSet>(),
+		                                     info.id, getValueID),
+		                     TaskPriority::DefaultPromiseEndpoint, false,
+		                     cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
 			reply = _reply;
 		}
 	}
@@ -2774,6 +2777,7 @@ void Transaction::reset() {
 
 void Transaction::fullReset() {
 	reset();
+	info.id = deterministicRandom()->randomUniqueID();
 	backoff = CLIENT_KNOBS->DEFAULT_BACKOFF;
 }
 
diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h
index b32e980c85..5fcd0605a2 100644
--- a/fdbclient/NativeAPI.actor.h
+++ b/fdbclient/NativeAPI.actor.h
@@ -19,6 +19,7 @@
  */
 
 #pragma once
+#include "flow/IRandom.h"
 #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_NATIVEAPI_ACTOR_G_H)
 #define FDBCLIENT_NATIVEAPI_ACTOR_G_H
 #include "fdbclient/NativeAPI.actor.g.h"
@@ -147,13 +148,15 @@ class ReadYourWritesTransaction; // workaround cyclic dependency
 struct TransactionInfo {
 	Optional<UID> debugID;
 	TaskPriority taskID;
+	TransactionID id;
 	bool useProvisionalProxies;
 
 	// Used to save conflicting keys if FDBTransactionOptions::REPORT_CONFLICTING_KEYS is enabled
 	// prefix/<key> : '1' - any keys equal or larger than this key are (probably) conflicting keys
 	// prefix/<key> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
 	std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
 
-	explicit TransactionInfo( TaskPriority taskID ) : taskID(taskID), useProvisionalProxies(false) {}
+	explicit TransactionInfo(TaskPriority taskID)
+	  : taskID(taskID), id(deterministicRandom()->randomUniqueID()), useProvisionalProxies(false) {}
 };
 
 struct TransactionLogInfo : public ReferenceCounted<TransactionLogInfo>, NonCopyable {
diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h
index 575d234159..107c36f775 100644
--- a/fdbclient/StorageServerInterface.h
+++ b/fdbclient/StorageServerInterface.h
@@ -175,15 +175,16 @@ struct GetValueRequest : TimedRequest {
 	Key key;
 	Version version;
 	Optional<TagSet> tags;
+	TransactionID txnID;
 	Optional<UID> debugID;
 	ReplyPromise<GetValueReply> reply;
 
 	GetValueRequest(){}
-	GetValueRequest(const Key& key, Version ver, Optional<TagSet> tags, Optional<UID> debugID) : key(key), version(ver), tags(tags), debugID(debugID) {}
+	GetValueRequest(const Key& key, Version ver, Optional<TagSet> tags, TransactionID txnID, Optional<UID> debugID)
+	  : key(key), version(ver), tags(tags), txnID(txnID), debugID(debugID) {}
 
 	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, key, version, tags, debugID, reply);
+		serializer(ar, key, version, tags, debugID, reply, txnID);
 	}
 };
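[Editor's note] txnID is declared between tags and debugID but serialized last: flat_buffers derives field ids from serializer() order, so appending the new field keeps the ids of the existing fields stable for mixed-version peers. The same pattern in miniature (all names hypothetical):

    struct ExampleRequest {
        constexpr static FileIdentifier file_identifier = 999999; // hypothetical id
        Key key;
        UID txnID;                         // new member, declared wherever convenient...
        ReplyPromise<GetValueReply> reply;

        template <class Ar>
        void serialize(Ar& ar) {
            serializer(ar, key, reply, txnID); // ...but appended last on the wire
        }
    };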
@@ -206,13 +207,16 @@ struct WatchValueRequest {
 	Optional<Value> value;
 	Version version;
 	Optional<TagSet> tags;
+	TransactionID txnID;
 	Optional<UID> debugID;
 	ReplyPromise<WatchValueReply> reply;
 
 	WatchValueRequest(){}
-	WatchValueRequest(const Key& key, Optional<Value> value, Version ver, Optional<TagSet> tags, Optional<UID> debugID) : key(key), value(value), version(ver), tags(tags), debugID(debugID) {}
-
-	template <class Ar>
+	WatchValueRequest(const Key& key, Optional<Value> value, Version ver, Optional<TagSet> tags, TransactionID txnID,
+	                  Optional<UID> debugID)
+	  : key(key), value(value), version(ver), tags(tags), txnID(txnID), debugID(debugID) {}
+
+	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, key, value, version, tags, debugID, reply);
+		serializer(ar, key, value, version, tags, debugID, reply, txnID);
 	}
 };
diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp
index 660b81afa2..656b9189ad 100644
--- a/fdbserver/StorageCache.actor.cpp
+++ b/fdbserver/StorageCache.actor.cpp
@@ -251,8 +251,8 @@ ACTOR Future<Void> getValueQ( StorageCacheData* data, GetValueRequest req ) {
 	if( req.debugID.present() ) {
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
-		//FIXME
-		fbTrace<GetValueDebugTrace>(req.debugID.get().first(), now(), GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ);
 	}
+	fbTrace<GetValueDebugTrace>(req.txnID, GetValueDebugTrace::STORAGECACHE_GETVALUE_DO_READ);
 
 	state Optional<Value> v;
 	state Version version = wait( waitForVersion( data, req.version ) );
diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp
index 5a186ad2b8..a185fe661c 100644
--- a/fdbserver/storageserver.actor.cpp
+++ b/fdbserver/storageserver.actor.cpp
@@ -21,6 +21,7 @@
 #include <cinttypes>
 #include "fdbrpc/fdbrpc.h"
 #include "fdbrpc/LoadBalance.h"
+#include "flow/FBTrace.h"
 #include "flow/IndexedSet.h"
 #include "flow/Hash3.h"
 #include "flow/ActorCollection.h"
@@ -922,11 +923,13 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
 
 	if( req.debugID.present() )
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());
+	fbTrace<GetValueDebugTrace>(req.txnID, GetValueDebugTrace::STORAGESERVER_GETVALUE_DO_READ);
 
 	state Optional<Value> v;
 	state Version version = wait( waitForVersion( data, req.version ) );
 	if( req.debugID.present() )
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask());
+	fbTrace<GetValueDebugTrace>(req.txnID, GetValueDebugTrace::STORAGESERVER_GETVALUE_AFTER_VERSION);
 
 	state uint64_t changeCounter = data->shardChangeCounter;
 
@@ -981,6 +984,7 @@ ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
 
 	if( req.debugID.present() )
 		g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());
+	fbTrace<GetValueDebugTrace>(req.txnID, GetValueDebugTrace::STORAGESERVER_GETVALUE_AFTER_READ);
 
 	GetValueReply reply(v);
 	reply.penalty = data->getPenalty();
@@ -1018,7 +1022,7 @@ ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req )
 	try {
 		state Version latest = data->data().latestVersion;
 		state Future<Void> watchFuture = data->watches.onChange(req.key);
-		GetValueRequest getReq( req.key, latest, req.tags, req.debugID );
+		GetValueRequest getReq( req.key, latest, req.tags, req.txnID, req.debugID );
 		state Future<Void> getValue = getValueQ( data, getReq ); //we are relying on the delay zero at the top of getValueQ, if removed we need one here
 		GetValueReply reply = wait( getReq.reply.getFuture() );
 		//TraceEvent("WatcherCheckValue").detail("Key", req.key ).detail("Value", req.value ).detail("CurrentValue", v ).detail("Ver", latest);
@@ -3708,7 +3712,7 @@ ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterface ssi )
 	self->actors.add(serveWatchValueRequests(self, ssi.watchValue.getFuture()));
self->transactionTagCounter.startNewInterval(self->thisServerID); - self->actors.add(recurring([this](){ self->transactionTagCounter.startNewInterval(self->thisServerID); }, SERVER_KNOBS->READ_TAG_MEASUREMENT_INTERVAL)); + self->actors.add(recurring([&](){ self->transactionTagCounter.startNewInterval(self->thisServerID); }, SERVER_KNOBS->READ_TAG_MEASUREMENT_INTERVAL)); self->coreStarted.send( Void() ); diff --git a/flow/FBTrace.h b/flow/FBTrace.h index da9a240a7a..622b329f61 100644 --- a/flow/FBTrace.h +++ b/flow/FBTrace.h @@ -20,6 +20,7 @@ #pragma once +#include "fdbclient/FDBTypes.h" #include "flow/FileIdentifier.h" #include "flow/FastRef.h" #include "flow/ObjectSerializer.h" @@ -132,12 +133,12 @@ public: READER_GETVALUEPREFIX_AFTER = 11 }; - uint64_t id; + TransactionID id; double time; int32_t location; GetValueDebugTrace() {} - GetValueDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {} + GetValueDebugTrace(TransactionID id, codeLocation loc, double time = g_network->now()) : id(id), time(time), location(loc) {} template void serialize(Ar& ar) { serializer(ar, id, time, location); @@ -155,12 +156,12 @@ public: NATIVEAPI_WATCHVALUE_AFTER_READ = 5 }; - uint64_t id; + TransactionID id; double time; int32_t location; WatchValueDebugTrace() {} - WatchValueDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {} + WatchValueDebugTrace(TransactionID id, double t, codeLocation loc) : id(id), time(t), location(loc) {} template void serialize(Ar& ar) { serializer(ar, id, time, location); @@ -194,12 +195,12 @@ public: TLOG_TLOGCOMMIT_AFTER = 20 }; - uint64_t id; + TransactionID id; double time; int32_t location; CommitDebugTrace() {} - CommitDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {} + CommitDebugTrace(TransactionID id, double t, codeLocation loc) : id(id), time(t), location(loc) {} template void serialize(Ar& ar) { serializer(ar, id, time, location); @@ -247,12 +248,12 @@ public: READWRITE_RANDOMREADWRITECLIENT_AFTER = 35 }; - uint64_t id; + TransactionID id; double time; int32_t location; TransactionDebugTrace() {} - TransactionDebugTrace(uint64_t debugID, double t, codeLocation loc) : id(debugID), time(t), location(loc) {} + TransactionDebugTrace(TransactionID id, double t, codeLocation loc) : id(id), time(t), location(loc) {} template void serialize(Ar& ar) { serializer(ar, id, time, location); diff --git a/flow/Trace.cpp b/flow/Trace.cpp index 369f4894ac..6441095014 100644 --- a/flow/Trace.cpp +++ b/flow/Trace.cpp @@ -22,6 +22,7 @@ #include "flow/Trace.h" #include "flow/FBTrace.h" #include "flow/FileTraceLogWriter.h" +#include "flow/Knobs.h" #include "flow/XmlTraceLogFormatter.h" #include "flow/JsonTraceLogFormatter.h" #include "flow/flow.h" @@ -295,6 +296,9 @@ public: formatter->getExtension(), maxLogsSize, [this]() { barriers->triggerAll(); }, issues)); + if (FLOW_KNOBS->FBTRACES_ENABLED) { + FBTraceImpl::open(directory, processName, basename, rs, maxLogsSize); + } if ( g_network->isSimulated() ) writer = Reference(new DummyThreadPool()); else From 09c136e4347feabd885cb81c2c96f51daef81415 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Mon, 8 Jun 2020 16:09:37 -0700 Subject: [PATCH 18/27] Framework for transaction tracing --- fdbclient/DatabaseContext.h | 4 +- fdbclient/FDBTypes.h | 2 +- fdbclient/MasterProxyInterface.h | 33 +- fdbclient/NativeAPI.actor.cpp | 84 +++- fdbclient/NativeAPI.actor.h | 8 +- fdbclient/StorageServerInterface.h | 29 +- 
fdbclient/vexillographer/fdb.options | 2 + fdbserver/BackupWorker.actor.cpp | 5 +- fdbserver/MasterProxyServer.actor.cpp | 39 +- fdbserver/StorageCache.actor.cpp | 3 +- fdbserver/fdbserver.actor.cpp | 25 +- fdbserver/storageserver.actor.cpp | 37 +- .../workloads/ConsistencyCheck.actor.cpp | 7 +- fdbserver/workloads/Cycle.actor.cpp | 26 +- flow/Arena.h | 3 + flow/CMakeLists.txt | 4 +- flow/FBTrace.cpp | 358 ------------------ flow/FBTrace.h | 268 ------------- flow/Knobs.cpp | 3 - flow/Knobs.h | 3 - flow/Trace.cpp | 4 - flow/Tracing.cpp | 82 ++++ flow/Tracing.h | 114 ++++++ 23 files changed, 419 insertions(+), 724 deletions(-) delete mode 100644 flow/FBTrace.cpp delete mode 100644 flow/FBTrace.h create mode 100644 flow/Tracing.cpp create mode 100644 flow/Tracing.h diff --git a/fdbclient/DatabaseContext.h b/fdbclient/DatabaseContext.h index 7e50f82516..b9b3399900 100644 --- a/fdbclient/DatabaseContext.h +++ b/fdbclient/DatabaseContext.h @@ -200,11 +200,13 @@ public: bool enableLocalityLoadBalance; struct VersionRequest { + SpanID spanID; Promise reply; TagSet tags; Optional debugID; - VersionRequest(TagSet tags = TagSet(), Optional debugID = Optional()) : tags(tags), debugID(debugID) {} + VersionRequest(SpanID spanID, TagSet tags = TagSet(), Optional debugID = Optional()) + : spanID(spanID), tags(tags), debugID(debugID) {} }; // Transaction start request batching diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index c5eccba5f7..7ae207a6e8 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -35,7 +35,7 @@ typedef uint64_t Sequence; typedef StringRef KeyRef; typedef StringRef ValueRef; typedef int64_t Generation; -typedef UID TransactionID; +typedef UID SpanID; enum { tagLocalitySpecial = -1, diff --git a/fdbclient/MasterProxyInterface.h b/fdbclient/MasterProxyInterface.h index 7216015535..11a339aa11 100644 --- a/fdbclient/MasterProxyInterface.h +++ b/fdbclient/MasterProxyInterface.h @@ -153,6 +153,7 @@ struct CommitTransactionRequest : TimedRequest { bool firstInBatch() const { return (flags & FLAG_FIRST_IN_BATCH) != 0; } Arena arena; + SpanID spanID; CommitTransactionRef transaction; ReplyPromise reply; uint32_t flags; @@ -162,7 +163,7 @@ struct CommitTransactionRequest : TimedRequest { template void serialize(Ar& ar) { - serializer(ar, transaction, reply, arena, flags, debugID); + serializer(ar, transaction, reply, arena, flags, debugID, spanID); } }; @@ -209,6 +210,7 @@ struct GetReadVersionRequest : TimedRequest { FLAG_PRIORITY_MASK = PRIORITY_SYSTEM_IMMEDIATE, }; + SpanID spanID; uint32_t transactionCount; uint32_t flags; TransactionPriority priority; @@ -219,9 +221,11 @@ struct GetReadVersionRequest : TimedRequest { ReplyPromise reply; GetReadVersionRequest() : transactionCount(1), flags(0) {} - GetReadVersionRequest(uint32_t transactionCount, TransactionPriority priority, uint32_t flags = 0, TransactionTagMap tags = TransactionTagMap(), Optional debugID = Optional()) - : transactionCount(transactionCount), priority(priority), flags(flags), tags(tags), debugID(debugID) - { + GetReadVersionRequest(SpanID spanID, uint32_t transactionCount, TransactionPriority priority, + uint32_t flags = 0, TransactionTagMap tags = TransactionTagMap(), + Optional debugID = Optional()) + : spanID(spanID), transactionCount(transactionCount), priority(priority), flags(flags), tags(tags), + debugID(debugID) { flags = flags & ~FLAG_PRIORITY_MASK; switch(priority) { case TransactionPriority::BATCH: @@ -237,12 +241,12 @@ struct GetReadVersionRequest : TimedRequest { ASSERT(false); } } - + 
bool operator < (GetReadVersionRequest const& rhs) const { return priority < rhs.priority; } template void serialize(Ar& ar) { - serializer(ar, transactionCount, flags, tags, debugID, reply); + serializer(ar, transactionCount, flags, tags, debugID, reply, spanID); if(ar.isDeserializing) { if((flags & PRIORITY_SYSTEM_IMMEDIATE) == PRIORITY_SYSTEM_IMMEDIATE) { @@ -275,6 +279,7 @@ struct GetKeyServerLocationsReply { struct GetKeyServerLocationsRequest { constexpr static FileIdentifier file_identifier = 9144680; Arena arena; + SpanID spanID; KeyRef begin; Optional end; int limit; @@ -282,24 +287,28 @@ struct GetKeyServerLocationsRequest { ReplyPromise reply; GetKeyServerLocationsRequest() : limit(0), reverse(false) {} - GetKeyServerLocationsRequest( KeyRef const& begin, Optional const& end, int limit, bool reverse, Arena const& arena ) : begin( begin ), end( end ), limit( limit ), reverse( reverse ), arena( arena ) {} - - template + GetKeyServerLocationsRequest(SpanID spanID, KeyRef const& begin, Optional const& end, int limit, + bool reverse, Arena const& arena) + : spanID(spanID), begin(begin), end(end), limit(limit), reverse(reverse), arena(arena) {} + + template void serialize(Ar& ar) { - serializer(ar, begin, end, limit, reverse, reply, arena); + serializer(ar, begin, end, limit, reverse, reply, spanID, arena); } }; struct GetRawCommittedVersionRequest { constexpr static FileIdentifier file_identifier = 12954034; + SpanID spanID; Optional debugID; ReplyPromise reply; - explicit GetRawCommittedVersionRequest(Optional const& debugID = Optional()) : debugID(debugID) {} + explicit GetRawCommittedVersionRequest(SpanID spanID, Optional const& debugID = Optional()) : spanID(spanID), debugID(debugID) {} + explicit GetRawCommittedVersionRequest() : spanID(), debugID() {} template void serialize( Ar& ar ) { - serializer(ar, debugID, reply); + serializer(ar, debugID, reply, spanID); } }; diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 5cbcbf4201..b1df86481b 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -20,6 +20,7 @@ #include "fdbclient/NativeAPI.actor.h" +#include <_types/_uint64_t.h> #include #include #include @@ -28,6 +29,7 @@ #include "fdbclient/ClusterInterface.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/DatabaseContext.h" +#include "fdbclient/FDBOptions.g.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/Knobs.h" #include "fdbclient/ManagementAPI.actor.h" @@ -48,9 +50,11 @@ #include "flow/Platform.h" #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" +#include "flow/Tracing.h" #include "flow/UnitTest.h" #include "fdbclient/versions.h" +#include "flow/serialize.h" #ifdef WIN32 #define WIN32_LEAN_AND_MEAN @@ -1336,6 +1340,7 @@ ACTOR Future>> transactionalGetServerInt //If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). 
 ACTOR Future< pair<KeyRange,Reference<LocationInfo>> > getKeyLocation_internal( Database cx, Key key, TransactionInfo info, bool isBackward = false ) {
+	state Span span("NA:getKeyLocation"_loc, { info.span->context });
 	if (isBackward) {
 		ASSERT( key != allKeys.begin && key <= allKeys.end );
 	} else {
@@ -1349,7 +1354,10 @@
 		++cx->transactionKeyServerLocationRequests;
 		choose {
 			when ( wait( cx->onMasterProxiesChanged() ) ) {}
-			when ( GetKeyServerLocationsReply rep = wait( basicLoadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(key, Optional<KeyRef>(), 100, isBackward, key.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
+			when(GetKeyServerLocationsReply rep = wait(basicLoadBalance(
+			         cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations,
+			         GetKeyServerLocationsRequest(span->context, key, Optional<KeyRef>(), 100, isBackward, key.arena()),
+			         TaskPriority::DefaultPromiseEndpoint))) {
 				++cx->transactionKeyServerLocationRequestsCompleted;
 				if( info.debugID.present() )
 					g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocation.After");
@@ -1381,6 +1389,7 @@ Future<pair<KeyRange, Reference<LocationInfo>>> getKeyLocation( Database const&
 }

 ACTOR Future< vector< pair<KeyRange,Reference<LocationInfo>> > > getKeyRangeLocations_internal( Database cx, KeyRange keys, int limit, bool reverse, TransactionInfo info ) {
+	state Span span("NA:getKeyRangeLocations"_loc, { info.span->context });
 	if( info.debugID.present() )
 		g_traceBatch.addEvent("TransactionDebug", info.debugID.get().first(), "NativeAPI.getKeyLocations.Before");
@@ -1388,7 +1397,10 @@
 		++cx->transactionKeyServerLocationRequests;
 		choose {
 			when ( wait( cx->onMasterProxiesChanged() ) ) {}
-			when ( GetKeyServerLocationsReply _rep = wait( basicLoadBalance( cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest(keys.begin, keys.end, limit, reverse, keys.arena()), TaskPriority::DefaultPromiseEndpoint ) ) ) {
+			when(GetKeyServerLocationsReply _rep = wait(basicLoadBalance(
+			         cx->getMasterProxies(info.useProvisionalProxies), &MasterProxyInterface::getKeyServersLocations,
+			         GetKeyServerLocationsRequest(span->context, keys.begin, keys.end, limit, reverse, keys.arena()),
+			         TaskPriority::DefaultPromiseEndpoint))) {
 				++cx->transactionKeyServerLocationRequestsCompleted;
 				state GetKeyServerLocationsReply rep = _rep;
 				if( info.debugID.present() )
@@ -1479,6 +1491,7 @@ Future<Void> Transaction::warmRange(Database cx, KeyRange keys) {

 ACTOR Future<Optional<Value>> getValue( Future<Version> version, Key key, Database cx, TransactionInfo info, Reference<TransactionLogInfo> trLogInfo, TagSet tags ) {
 	state Version ver = wait( version );
+	state Span span("NA:getValue"_loc, { info.span->context });
 	cx->validateVersion(ver);

 	loop {
@@ -1513,8 +1526,8 @@
 			when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
 			when(GetValueReply _reply = wait(
 			         loadBalance(ssi.second, &StorageServerInterface::getValue,
-			                     GetValueRequest(key, ver, cx->sampleReadTags() ? tags : Optional<TagSet>(),
-			                                     info.id, getValueID),
+			                     GetValueRequest(span->context, key, ver, cx->sampleReadTags() ? tags : Optional<TagSet>(),
+			                                     getValueID),
 			                     TaskPriority::DefaultPromiseEndpoint, false,
 			                     cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
 				reply = _reply;
@@ -1574,6 +1587,7 @@ ACTOR Future<Key> getKey( Database cx, KeySelector k, Future<Version> version, TransactionInfo info, TagSet tags ) {
 	wait(success(version));

 	state Optional<UID> getKeyID = Optional<UID>();
+	state Span span("NA:getKey"_loc, { info.span->context });
 	if( info.debugID.present() ) {
 		getKeyID = nondeterministicRandom()->randomUniqueID();
@@ -1602,9 +1616,11 @@
 		choose {
 			when(wait(cx->connectionFileChanged())) { throw transaction_too_old(); }
 			when(GetKeyReply _reply =
-			         wait(loadBalance(ssi.second, &StorageServerInterface::getKey, GetKeyRequest(k, version.get(), cx->sampleReadTags() ? tags : Optional<TagSet>(), getKeyID),
-			                          TaskPriority::DefaultPromiseEndpoint, false,
-			                          cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
+			         wait(loadBalance(ssi.second, &StorageServerInterface::getKey,
+			                          GetKeyRequest(span->context, k, version.get(),
+			                                        cx->sampleReadTags() ? tags : Optional<TagSet>(), getKeyID),
+			                          TaskPriority::DefaultPromiseEndpoint, false,
+			                          cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr))) {
 				reply = _reply;
 			}
 		}
@@ -1637,12 +1653,15 @@
 	}
 }

-ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version ) {
+ACTOR Future<Version> waitForCommittedVersion( Database cx, Version version, SpanID spanID ) {
+	state Span span("NA:waitForCommittedVersion"_loc, { spanID });
 	try {
 		loop {
 			choose {
 				when ( wait( cx->onMasterProxiesChanged() ) ) {}
-				when ( GetReadVersionReply v = wait( basicLoadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, TransactionPriority::IMMEDIATE ), cx->taskID ) ) ) {
+				when(GetReadVersionReply v = wait(basicLoadBalance(
+				         cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion,
+				         GetReadVersionRequest(span->context, 0, TransactionPriority::IMMEDIATE), cx->taskID))) {
 					cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version);

 					if (v.version >= version)
@@ -1658,11 +1677,14 @@
 	}
 }

-ACTOR Future<Version> getRawVersion( Database cx ) {
+ACTOR Future<Version> getRawVersion( Database cx, SpanID spanID ) {
+	state Span span("NA:getRawVersion"_loc, { spanID });
 	loop {
 		choose {
 			when ( wait( cx->onMasterProxiesChanged() ) ) {}
-			when ( GetReadVersionReply v = wait( basicLoadBalance( cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion, GetReadVersionRequest( 0, TransactionPriority::IMMEDIATE ), cx->taskID ) ) ) {
+			when(GetReadVersionReply v =
+			         wait(basicLoadBalance(cx->getMasterProxies(false), &MasterProxyInterface::getConsistentReadVersion,
+			                               GetReadVersionRequest(spanID, 0, TransactionPriority::IMMEDIATE), cx->taskID))) {
 				return v.version;
 			}
 		}
@@ -1676,6 +1698,7 @@ ACTOR Future<Void> readVersionBatcher(

 ACTOR Future<Void> watchValue(Future<Version> version, Key key, Optional<Value> value, Database cx, TransactionInfo info, TagSet tags) {
 	state Version ver = wait( version );
+	state Span span(deterministicRandom()->randomUniqueID(), "NA:watchValue"_loc, { info.span->context });
 	cx->validateVersion(ver);
 	ASSERT(ver != latestVersion);
@@ -1692,9 +1715,11 @@
 		}
 		state WatchValueReply resp;
 		choose {
-			when(WatchValueReply r = wait(loadBalance(ssi.second, &StorageServerInterface::watchValue,
-			                                          WatchValueRequest(key, value, ver, cx->sampleReadTags() ? tags : Optional<TagSet>(), watchValueID),
-			                                          TaskPriority::DefaultPromiseEndpoint))) {
+			when(WatchValueReply r = wait(
+			         loadBalance(ssi.second, &StorageServerInterface::watchValue,
+			                     WatchValueRequest(info.span->context, key, value, ver,
+			                                       cx->sampleReadTags() ? tags : Optional<TagSet>(), watchValueID),
+			                     TaskPriority::DefaultPromiseEndpoint))) {
 				resp = r;
 			}
 			when(wait(cx->connectionFile ? cx->connectionFile->onChange() : Never())) { wait(Never()); }
@@ -1705,7 +1730,7 @@

 		//FIXME: wait for known committed version on the storage server before replying,
 		//cannot do this until the storage server is notified on knownCommittedVersion changes from tlog (faster than the current update loop)
-		Version v = wait(waitForCommittedVersion(cx, resp.version));
+		Version v = wait(waitForCommittedVersion(cx, resp.version, span->context));

 		//TraceEvent("WatcherCommitted").detail("CommittedVersion", v).detail("WatchVersion", resp.version).detail("Key", key ).detail("Value", value);

@@ -1758,6 +1783,7 @@ ACTOR Future<Standalone<RangeResultRef>> getExactRange( Database cx, Version version,
 	KeyRange keys, GetRangeLimits limits, bool reverse, TransactionInfo info, TagSet tags )
 {
 	state Standalone<RangeResultRef> output;
+	state Span span("NA:getExactRange"_loc, { info.span->context });

 	//printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str());
 	loop {
@@ -1771,6 +1797,7 @@
 			req.version = version;
 			req.begin = firstGreaterOrEqual( range.begin );
 			req.end = firstGreaterOrEqual( range.end );
+			req.spanID = span->context;

 			transformRangeLimits(limits, reverse, req);
 			ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse);
@@ -2402,7 +2429,7 @@ ACTOR Future<Void> watch(Reference<Watch> watch, Database cx, TagSet tags, TransactionInfo info) {
 }

 Future<Version> Transaction::getRawReadVersion() {
-	return ::getRawVersion(cx);
+	return ::getRawVersion(cx, info.span->context);
 }

 Future< Void > Transaction::watch( Reference<Watch> watch ) {
@@ -2756,7 +2783,7 @@ void Transaction::reset() {

 void Transaction::fullReset() {
 	reset();
-	info.id = deterministicRandom()->randomUniqueID();
+	info.span = Span(info.span->location);
 	backoff = CLIENT_KNOBS->DEFAULT_BACKOFF;
 }

@@ -2919,6 +2946,8 @@ void Transaction::setupWatches() {
 ACTOR static Future<Void> tryCommit( Database cx, Reference<TransactionLogInfo> trLogInfo, CommitTransactionRequest req, Future<Version> readVersion, TransactionInfo info, Version* pCommittedVersion, Transaction* tr, TransactionOptions options) {
 	state TraceInterval interval( "TransactionCommit" );
 	state double startTime = now();
+	state Span span("NA:tryCommit"_loc, { info.span->context });
+	req.spanID = span->context;
 	if (info.debugID.present())
 		TraceEvent(interval.begin()).detail( "Parent", info.debugID.get() );
 	try {
@@ -3342,6 +3371,14 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<StringRef> value ) {
+	case FDBTransactionOptions::SPAN_PARENT:
+		validateOptionValue(value, true);
+		info.span->parents.emplace(BinaryReader::fromStringRef<UID>(value.get(), Unversioned()));
+		break;
+
 	case FDBTransactionOptions::REPORT_CONFLICTING_KEYS:
 		validateOptionValue(value, false);
 		options.reportConflictingKeys = true;
@@ -3352,13 +3389,15 @@ void Transaction::setOption( FDBTransactionOptions::Option option, Optional<StringRef> value ) {
-ACTOR Future<GetReadVersionReply> getConsistentReadVersion( DatabaseContext *cx, uint32_t transactionCount, TransactionPriority priority, uint32_t flags, TransactionTagMap<uint32_t> tags, Optional<UID> debugID ) {
+ACTOR Future<GetReadVersionReply> getConsistentReadVersion(Span span, DatabaseContext* cx, uint32_t transactionCount,
+                                                           TransactionPriority priority, uint32_t flags,
+                                                           TransactionTagMap<uint32_t> tags, Optional<UID> debugID) {
 	try {
 		++cx->transactionReadVersionBatches;
 		if( debugID.present() )
 			g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.Before");
 		loop {
-			state GetReadVersionRequest req( transactionCount, priority, flags, tags, debugID );
+			state GetReadVersionRequest req( span->context, transactionCount, priority, flags, tags, debugID );
 			choose {
 				when ( wait( cx->onMasterProxiesChanged() ) ) {}
 				when ( GetReadVersionReply v = wait( basicLoadBalance( cx->getMasterProxies(flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES), &MasterProxyInterface::getConsistentReadVersion, req, cx->taskID ) ) ) {
@@ -3409,6 +3448,7 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream<DatabaseContext::VersionRequest> versionStream, TransactionPriority priority, uint32_t flags ) {
 	state PromiseStream<double> replyTimes;
 	state PromiseStream<Error> _errorStream;
 	state double batchTime = 0;
+	state Span span(deterministicRandom()->randomUniqueID(), "NAPI:readVersionBatcher"_loc);
 	loop {
 		send_batch = false;
 		choose {
@@ -3419,6 +3459,7 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream<DatabaseContext::VersionRequest> versionStream, TransactionPriority priority, uint32_t flags ) {
+				span->parents.insert(req.spanID);
 				requests.push_back(req.reply);
 				for(auto tag : req.tags) {
 					++tags[tag];
@@ -3446,7 +3487,8 @@ ACTOR Future<Void> readVersionBatcher( DatabaseContext *cx, FutureStream<DatabaseContext::VersionRequest> versionStream, TransactionPriority priority, uint32_t flags ) {
 			Future<Void> batch = incrementalBroadcastWithError(
-			    getConsistentReadVersion(cx, count, priority, flags, std::move(tags), std::move(debugID)),
+			    getConsistentReadVersion(Span("NAPI:getConsistentReadVersion"_loc, { span->context }),
+			                             cx, count, priority, flags, std::move(tags), std::move(debugID)),
 			    std::move(requests), CLIENT_KNOBS->BROADCAST_BATCH_SIZE);

 			tags.clear();
@@ -3580,7 +3622,7 @@ Future<Version> Transaction::getReadVersion(uint32_t flags) {
 		batcher.actor = readVersionBatcher( cx.getPtr(), batcher.stream.getFuture(), options.priority, flags );
 	}

-	auto const req = DatabaseContext::VersionRequest(options.tags, info.debugID);
+	auto const req = DatabaseContext::VersionRequest(info.span->context, options.tags, info.debugID);
 	batcher.stream.send(req);
 	startTime = now();
 	readVersion = extractReadVersion( cx.getPtr(), options.priority, trLogInfo, req.reply.getFuture(), options.lockAware, startTime, metadataVersion, options.tags);
diff --git a/fdbclient/NativeAPI.actor.h b/fdbclient/NativeAPI.actor.h
index 69eea2ea9e..17f1807bd0 100644
--- a/fdbclient/NativeAPI.actor.h
+++ b/fdbclient/NativeAPI.actor.h
@@ -20,6 +20,7 @@
 #pragma once

 #include "flow/IRandom.h"
+#include "flow/Tracing.h"
 #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_NATIVEAPI_ACTOR_G_H)
 #define FDBCLIENT_NATIVEAPI_ACTOR_G_H
 #include "fdbclient/NativeAPI.actor.g.h"
@@ -148,7 +149,7 @@ class ReadYourWritesTransaction; // workaround cyclic dependency
 struct TransactionInfo {
 	Optional<UID> debugID;
 	TaskPriority taskID;
-	TransactionID id;
+	Span span;
 	bool useProvisionalProxies;
 	// Used to save conflicting keys if FDBTransactionOptions::REPORT_CONFLICTING_KEYS is enabled
 	// prefix/<key1> : '1' - any keys equal or larger than this key are (probably) conflicting keys
 	std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;

 	explicit TransactionInfo(TaskPriority taskID)
-	  : taskID(taskID), id(deterministicRandom()->randomUniqueID()), useProvisionalProxies(false) {}
+	  : taskID(taskID), span(deterministicRandom()->randomUniqueID(), "Transaction"_loc), useProvisionalProxies(false) {
+	}
 };
@@ -332,7 +334,7 @@ private:
 	Future<Void> committing;
 };

-ACTOR Future<Version> waitForCommittedVersion(Database cx, Version version);
+ACTOR Future<Version> waitForCommittedVersion(Database cx, Version version, SpanID spanID);
 ACTOR Future<Standalone<VectorRef<DDMetricsRef>>> waitDataDistributionMetricsList(Database cx, KeyRange keys, int shardLimit);

diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h
index 122fd71926..9ccaa2bf04 100644
--- a/fdbclient/StorageServerInterface.h
+++ b/fdbclient/StorageServerInterface.h
@@ -169,19 +169,20 @@ struct GetValueReply : public LoadBalancedReply {

 struct GetValueRequest : TimedRequest {
 	constexpr static FileIdentifier file_identifier = 8454530;
+	SpanID spanID;
 	Key key;
 	Version version;
 	Optional<TagSet> tags;
-	TransactionID txnID;
 	Optional<UID> debugID;
 	ReplyPromise<GetValueReply> reply;

 	GetValueRequest(){}
-	GetValueRequest(const Key& key, Version ver, Optional<TagSet> tags, TransactionID txnID, Optional<UID> debugID) : key(key), version(ver), tags(tags), debugID(debugID) {}
-
-	template <class Ar>
+	GetValueRequest(SpanID spanID, const Key& key, Version ver, Optional<TagSet> tags, Optional<UID> debugID)
+	  : spanID(spanID), key(key), version(ver), tags(tags), debugID(debugID) {}
+
+	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, key, version, tags, debugID, reply, txnID);
+		serializer(ar, key, version, tags, debugID, reply, spanID);
 	}
 };
@@ -200,22 +201,22 @@ struct WatchValueReply {

 struct WatchValueRequest {
 	constexpr static FileIdentifier file_identifier = 14747733;
+	SpanID spanID;
 	Key key;
 	Optional<Value> value;
 	Version version;
 	Optional<TagSet> tags;
-	TransactionID txnID;
 	Optional<UID> debugID;
 	ReplyPromise<WatchValueReply> reply;

 	WatchValueRequest(){}
-	WatchValueRequest(const Key& key, Optional<Value> value, Version ver, Optional<TagSet> tags, TransactionID txnID,
+	WatchValueRequest(SpanID spanID, const Key& key, Optional<Value> value, Version ver, Optional<TagSet> tags,
 	                  Optional<UID> debugID)
-	  : key(key), value(value), version(ver), tags(tags), txnID(txnID), debugID(debugID) {}
+	  : spanID(spanID), key(key), value(value), version(ver), tags(tags), debugID(debugID) {}

 	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, key, value, version, tags, debugID, reply);
+		serializer(ar, key, value, version, tags, debugID, reply, spanID);
 	}
 };
@@ -237,6 +238,7 @@ struct GetKeyValuesReply : public LoadBalancedReply {

 struct GetKeyValuesRequest : TimedRequest {
 	constexpr static FileIdentifier file_identifier = 6795746;
+	SpanID spanID;
 	Arena arena;
 	KeySelectorRef begin, end;
 	Version version; // or latestVersion
@@ -249,7 +251,7 @@ struct GetKeyValuesRequest : TimedRequest {
 	GetKeyValuesRequest() : isFetchKeys(false) {}
 	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, begin, end, version, limit, limitBytes, isFetchKeys, tags, debugID, reply, arena);
+		serializer(ar, begin, end, version, limit, limitBytes, isFetchKeys, tags, debugID, reply, spanID, arena);
 	}
 };
@@ -268,6 +270,7 @@ struct GetKeyReply : public LoadBalancedReply {

 struct GetKeyRequest : TimedRequest {
 	constexpr static FileIdentifier file_identifier = 10457870;
+	SpanID spanID;
 	Arena arena;
 	KeySelectorRef sel;
 	Version version; // or latestVersion
@@ -276,11 +279,13 @@ struct GetKeyRequest : TimedRequest {
 	ReplyPromise<GetKeyReply> reply;

 	GetKeyRequest() {}
-	GetKeyRequest(KeySelectorRef const& sel, Version version, Optional<TagSet> tags, Optional<UID> debugID) : sel(sel), version(version), debugID(debugID) {}
+	GetKeyRequest(SpanID spanID, KeySelectorRef const& sel, Version version, Optional<TagSet> tags,
+	              Optional<UID> debugID)
+	  : spanID(spanID), sel(sel), version(version), debugID(debugID) {}

 	template <class Ar>
 	void serialize( Ar& ar ) {
-		serializer(ar, sel, version, tags, debugID, reply, arena);
+		serializer(ar, sel, version, tags, debugID, reply, spanID, arena);
 	}
 };
diff --git a/fdbclient/vexillographer/fdb.options b/fdbclient/vexillographer/fdb.options
index d7463e5845..86af0f3f5c 100644
--- a/fdbclient/vexillographer/fdb.options
+++ b/fdbclient/vexillographer/fdb.options
@@ -268,6 +268,8 @@ description is not currently required but encouraged.
 	<Option name="tag" code="504"
 	        paramType="String" paramDescription="String identifier used to associated this transaction with a throttling group. Must not exceed 16 characters"
 	        description="Adds a tag to the transaction that can be used to apply manual targeted throttling. At most 5 tags can be set on a transaction." />
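
Taken together, the changes in this patch replace the single per-transaction TransactionID with a Span: each transaction holds a root span, every actor on the read/commit path opens a child span under it, and only the 16-byte SpanID of the sender's span is serialized into each proxy and storage request. Below is a minimal sketch of that propagation pattern, assuming the Span/Location API that flow/Tracing.h introduces here; PingRequest and tracedPing are hypothetical names used purely for illustration, not part of the patch:

    #include "fdbclient/NativeAPI.actor.h" // TransactionInfo (now carries a Span)
    #include "flow/Tracing.h"              // Span, SpanID, ""_loc (added by this patch)
    #include "flow/serialize.h"

    // Hypothetical request, mirroring GetValueRequest above: only the
    // 16-byte SpanID of the caller's span crosses the network.
    struct PingRequest {
    	SpanID spanID;

    	template <class Ar>
    	void serialize(Ar& ar) {
    		serializer(ar, spanID);
    	}
    };

    // Hypothetical client-side helper: open a child span whose parent is the
    // transaction's root span, then stamp the outgoing request with its context,
    // exactly as getValue()/tryCommit() do above.
    void tracedPing(TransactionInfo& info) {
    	Span span("NA:tracedPing"_loc, { info.span->context });
    	PingRequest req;
    	req.spanID = span->context;
    	// The receiving role can open Span("SS:ping"_loc, { req.spanID }) so
    	// its server-side work joins the same trace tree.
    }

Serializing only the SpanID (just a typedef of UID per the FDBTypes.h change) keeps request messages small; each role reconstructs its own child span locally from the deserialized parent ID, which is why spanID is appended to every serializer(...) call in the request structs above.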