foundationdb/fdbserver/LogSystem.h

1161 lines
44 KiB
C++

/*
* LogSystem.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBSERVER_LOGSYSTEM_H
#define FDBSERVER_LOGSYSTEM_H
#include <set>
#include <vector>
#include "fdbserver/SpanContextMessage.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbserver/MutationTracking.h"
#include "flow/Arena.h"
#include "flow/Error.h"
#include "flow/Histogram.h"
#include "flow/IndexedSet.h"
#include "flow/Knobs.h"
#include "fdbrpc/ReplicationPolicy.h"
#include "fdbrpc/Locality.h"
#include "fdbrpc/Replication.h"
struct DBCoreState;
struct TLogSet;
struct CoreTLogSet;
// Reference-counted record holding the time of the last reset, a future gating the
// next reset check, and running counts of slow and fast replies.
// NOTE(review): presumably one instance per tLog connection (see
// LogSet::connectionResetTrackers) -- confirm against the code that updates it.
struct ConnectionResetInfo : public ReferenceCounted<ConnectionResetInfo> {
    double lastReset = now(); // starts at construction time
    Future<Void> resetCheck{ Void() }; // starts out already ready
    int slowReplies = 0;
    int fastReplies = 0;

    ConnectionResetInfo() {}
};
// The set of tLog servers, logRouters and backupWorkers for a log tag
class LogSet : NonCopyable, public ReferenceCounted<LogSet> {
public:
// Interfaces of the tLog servers, log routers and backup workers that make up this set.
std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers;
std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logRouters;
std::vector<Reference<AsyncVar<OptionalInterface<BackupInterface>>>> backupWorkers;
// NOTE(review): presumably parallel to logServers (one tracker per tLog); not established in this header -- confirm.
std::vector<Reference<ConnectionResetInfo>> connectionResetTrackers;
std::vector<Reference<Histogram>> tlogPushDistTrackers;
int32_t tLogWriteAntiQuorum;
int32_t tLogReplicationFactor;
std::vector<LocalityData> tLogLocalities; // Stores the localities of the log servers
TLogVersion tLogVersion;
// Policy handed to selectReplicas() by populateSatelliteTagLocations(), satisfiesPolicy() and getPushLocations().
Reference<IReplicationPolicy> tLogPolicy;
// The next three members are rebuilt together by updateLocalitySet(): logServerSet is a LocalityMap<int>,
// logIndexArray[i] == i, and logEntryArray[i] is the LocalityEntry returned when locality i was added to the map.
Reference<LocalitySet> logServerSet;
std::vector<int> logIndexArray;
std::vector<LocalityEntry> logEntryArray;
bool isLocal; // true if the LogSet is in primary DC or primary DC's satellite
// Tag locality of this set; compared against tagLocalitySatellite / tagLocalitySpecial and against the
// locality of individual tags in bestLocationFor() and getPushLocations().
int8_t locality;
Version startVersion;
std::vector<Future<TLogLockResult>> replies;
// Filled by populateSatelliteTagLocations(). Readers index it with 0 for txsTag and tag.id + 1 otherwise;
// element [0] of each inner vector is the location returned by bestLocationFor().
std::vector<std::vector<int>> satelliteTagLocations;
// Default state: zero anti-quorum and replication factor, local, invalid locality and invalid start version.
LogSet()
: tLogWriteAntiQuorum(0), tLogReplicationFactor(0), isLocal(true), locality(tagLocalityInvalid),
startVersion(invalidVersion) {}
// Conversions from the TLogSet / CoreTLogSet representations; defined out of line.
LogSet(const TLogSet& tlogSet);
LogSet(const CoreTLogSet& coreSet);
std::string logRouterString() {
std::string result;
for (int i = 0; i < logRouters.size(); i++) {
if (i > 0) {
result += ", ";
}
result += logRouters[i]->get().id().toString();
}
return result;
}
// Returns true if any log router in this set currently reports the given id.
bool hasLogRouter(UID id) const {
    return std::any_of(logRouters.begin(), logRouters.end(), [&id](const auto& router) {
        return router->get().id() == id;
    });
}
// Returns true if any backup worker in this set currently reports the given id.
bool hasBackupWorker(UID id) const {
    return std::any_of(backupWorkers.begin(), backupWorkers.end(), [&id](const auto& worker) {
        return worker->get().id() == id;
    });
}
std::string logServerString() {
std::string result;
for (int i = 0; i < logServers.size(); i++) {
if (i > 0) {
result += ", ";
}
result += logServers[i]->get().id().toString();
}
return result;
}
// Rebuilds satelliteTagLocations from scratch. Each entry ("team") is a list of indices into tLogLocalities
// that satisfies tLogPolicy. Teams are built greedily from the servers used by the fewest teams so far, and
// the member that has been placed first least often is moved to the front of its team.
// Asserts if some team cannot be completed, then calls checkSatelliteTagLocations() to report the balance.
void populateSatelliteTagLocations(int logRouterTags, int oldLogRouterTags, int txsTags, int oldTxsTags) {
satelliteTagLocations.clear();
// One team per index in [0, max(logRouterTags, oldLogRouterTags, txsTags, oldTxsTags)].
satelliteTagLocations.resize(std::max({ logRouterTags, oldLogRouterTags, txsTags, oldTxsTags }) + 1);
// server index -> number of teams in which that server occupies the first slot.
std::map<int, int> server_usedBest;
// (number of teams the server already belongs to, server index). The set ordering makes iteration visit
// the least-used servers first.
std::set<std::pair<int, int>> used_servers;
for (int i = 0; i < tLogLocalities.size(); i++) {
used_servers.insert(std::make_pair(0, i));
}
Reference<LocalitySet> serverSet = Reference<LocalitySet>(new LocalityMap<std::pair<int, int>>());
LocalityMap<std::pair<int, int>>* serverMap = (LocalityMap<std::pair<int, int>>*)serverSet.getPtr();
std::vector<std::pair<int, int>> resultPairs;
for (int loc = 0; loc < satelliteTagLocations.size(); loc++) {
// Fill order: teams 1..logRouterTags first, then team 0, then any remaining indices unchanged.
int team = loc;
if (loc < logRouterTags) {
team = loc + 1;
} else if (loc == logRouterTags) {
team = 0;
}
bool teamComplete = false;
alsoServers.resize(1);
serverMap->clear();
resultPairs.clear();
// Grow the candidate pool one server at a time (least-used first) and retry the policy after each addition.
for (auto& used_idx : used_servers) {
auto entry = serverMap->add(tLogLocalities[used_idx.second], &used_idx);
// The first candidate seeds the team and is passed to selectReplicas() as an already-chosen server.
if (!resultPairs.size()) {
resultPairs.push_back(used_idx);
alsoServers[0] = entry;
}
resultEntries.clear();
if (serverSet->selectReplicas(tLogPolicy, alsoServers, resultEntries)) {
for (auto& entry : resultEntries) {
resultPairs.push_back(*serverMap->getObject(entry));
}
// Swap the member with the fewest first-slot assignments to the front; on ties the earlier member stays.
int firstBestUsed = server_usedBest[resultPairs[0].second];
for (int i = 1; i < resultPairs.size(); i++) {
int thisBestUsed = server_usedBest[resultPairs[i].second];
if (thisBestUsed < firstBestUsed) {
std::swap(resultPairs[0], resultPairs[i]);
firstBestUsed = thisBestUsed;
}
}
server_usedBest[resultPairs[0].second]++;
// Record the team and bump every member's use count. This erases from and inserts into used_servers
// while the enclosing loop is iterating it; that is only safe because of the break below.
for (auto& res : resultPairs) {
satelliteTagLocations[team].push_back(res.second);
used_servers.erase(res);
res.first++;
used_servers.insert(res);
}
teamComplete = true;
break;
}
}
// Fails when no prefix of the candidate list satisfied the policy (including when there are no candidates).
ASSERT(teamComplete);
}
checkSatelliteTagLocations();
}
void checkSatelliteTagLocations() {
std::vector<int> usedBest;
std::vector<int> used;
usedBest.resize(tLogLocalities.size());
used.resize(tLogLocalities.size());
for (auto team : satelliteTagLocations) {
usedBest[team[0]]++;
for (auto loc : team) {
used[loc]++;
}
}
int minUsedBest = satelliteTagLocations.size();
int maxUsedBest = 0;
for (auto i : usedBest) {
minUsedBest = std::min(minUsedBest, i);
maxUsedBest = std::max(maxUsedBest, i);
}
int minUsed = satelliteTagLocations.size();
int maxUsed = 0;
for (auto i : used) {
minUsed = std::min(minUsed, i);
maxUsed = std::max(maxUsed, i);
}
bool foundDuplicate = false;
std::set<Optional<Key>> zones;
std::set<Optional<Key>> dcs;
for (auto& loc : tLogLocalities) {
if (zones.count(loc.zoneId())) {
foundDuplicate = true;
break;
}
zones.insert(loc.zoneId());
dcs.insert(loc.dcId());
}
bool moreThanOneDC = dcs.size() > 1 ? true : false;
TraceEvent(((maxUsed - minUsed > 1) || (maxUsedBest - minUsedBest > 1))
? (g_network->isSimulated() && !foundDuplicate && !moreThanOneDC ? SevError : SevWarnAlways)
: SevInfo,
"CheckSatelliteTagLocations")
.detail("MinUsed", minUsed)
.detail("MaxUsed", maxUsed)
.detail("MinUsedBest", minUsedBest)
.detail("MaxUsedBest", maxUsedBest)
.detail("DuplicateZones", foundDuplicate)
.detail("NumOfDCs", dcs.size());
}
// Returns the index of the preferred log server for `tag`.
// For a satellite set this is the first member of the tag's team in satelliteTagLocations
// (team 0 for txsTag, team tag.id + 1 otherwise). For any other set it is a modulus over
// the number of log servers.
int bestLocationFor(Tag tag) {
    const bool isTxs = (tag == txsTag);
    if (locality == tagLocalitySatellite) {
        const auto& team = satelliteTagLocations[isTxs ? 0 : tag.id + 1];
        return team[0];
    }

    // the following logic supports upgrades from 5.X
    const auto serverCount = logServers.size();
    if (isTxs) {
        return txsTagOld % serverCount;
    }
    return tag.id % serverCount;
}
void updateLocalitySet(std::vector<LocalityData> const& localities) {
LocalityMap<int>* logServerMap;
logServerSet = Reference<LocalitySet>(new LocalityMap<int>());
logServerMap = (LocalityMap<int>*)logServerSet.getPtr();
logEntryArray.clear();
logEntryArray.reserve(localities.size());
logIndexArray.clear();
logIndexArray.reserve(localities.size());
for (int i = 0; i < localities.size(); i++) {
logIndexArray.push_back(i);
logEntryArray.push_back(logServerMap->add(localities[i], &logIndexArray.back()));
}
}
// Returns true when selectReplicas() needs no entries beyond `locations` to satisfy tLogPolicy.
// Asserts if the policy cannot be satisfied at all. Overwrites the resultEntries scratch buffer.
bool satisfiesPolicy(const std::vector<LocalityEntry>& locations) {
    resultEntries.clear();

    // Run the policy, assert if unable to satisfy
    const bool selected = logServerSet->selectReplicas(tLogPolicy, locations, resultEntries);
    ASSERT(selected);

    return resultEntries.empty();
}
// Appends to `locations` the log server indices (each shifted by locationOffset) that a message
// carrying `tags` must be pushed to.
//
// Satellite sets: for txsTag and for tags with txs or log-router locality, appends every member of
// the tag's team from satelliteTagLocations, then uniquify()s the whole `locations` vector.
//
// Other sets: starts from the best location of each relevant tag (or from every log server when
// allLocations is true), then appends whatever extra servers selectReplicas() picks to satisfy
// tLogPolicy. Asserts if the policy cannot be satisfied. Overwrites the newLocations, alsoServers
// and resultEntries scratch buffers.
void getPushLocations(VectorRef<Tag> tags,
                      std::vector<int>& locations,
                      int locationOffset,
                      bool allLocations = false) {
    if (locality == tagLocalitySatellite) {
        for (auto& t : tags) {
            const bool storedOnSatellite =
                t == txsTag || t.locality == tagLocalityTxs || t.locality == tagLocalityLogRouter;
            if (!storedOnSatellite) {
                continue;
            }
            const auto& team = satelliteTagLocations[t == txsTag ? 0 : t.id + 1];
            for (int loc : team) {
                locations.push_back(locationOffset + loc);
            }
        }
        uniquify(locations);
        return;
    }

    newLocations.clear();
    alsoServers.clear();
    resultEntries.clear();

    if (!allLocations) {
        for (auto& t : tags) {
            if (locality == tagLocalitySpecial || t.locality == locality || t.locality < 0) {
                newLocations.push_back(bestLocationFor(t));
            }
        }
    } else {
        // special handling for allLocations
        TraceEvent("AllLocationsSet").log();
        for (int server = 0; server < logServers.size(); server++) {
            newLocations.push_back(server);
        }
    }

    uniquify(newLocations);
    if (!newLocations.empty()) {
        alsoServers.reserve(newLocations.size());
    }

    // Convert locations to the also servers
    for (int location : newLocations) {
        locations.push_back(locationOffset + location);
        alsoServers.push_back(logEntryArray[location]);
    }

    // Run the policy, assert if unable to satisfy
    const bool policySatisfied = logServerSet->selectReplicas(tLogPolicy, alsoServers, resultEntries);
    ASSERT(policySatisfied);

    // Add the new servers to the location array
    LocalityMap<int>* logServerMap = static_cast<LocalityMap<int>*>(logServerSet.getPtr());
    for (const auto& entry : resultEntries) {
        locations.push_back(locationOffset + *logServerMap->getObject(entry));
    }
    //TraceEvent("GetPushLocations").detail("Policy", tLogPolicy->info())
    //  .detail("Results", locations.size()).detail("Selection", logServerSet->size())
    //  .detail("Included", alsoServers.size()).detail("Duration", timer() - t);
}
private:
// Scratch buffers reused between calls instead of being declared locally. alsoServers and resultEntries
// are overwritten by populateSatelliteTagLocations(), satisfiesPolicy() (resultEntries only) and
// getPushLocations(); newLocations is overwritten by getPushLocations(). Their contents are not
// meaningful after those calls return.
std::vector<LocalityEntry> alsoServers, resultEntries;
std::vector<int> newLocations;
};
struct ILogSystem {
// An ILogSystem represents a particular (possibly provisional) epoch of the log subsystem
// Abstract cursor over a stream of log messages. The concrete cursors declared later in ILogSystem
// (ServerPeekCursor, MergedPeekCursor, SetPeekCursor, MultiCursor, BufferedCursor) implement it.
// Typical use, per the preconditions below: wait on getMore(), then while hasMessage() read the current
// message with reader(), getMessage() or getMessageWithTags() and move on with nextMessage().
// NOTE(review): "caller cannot call getMessage(), getMessageWithTags(), and reader()" below presumably
// means these cannot be called again for the same message until nextMessage() -- confirm.
struct IPeekCursor {
// clones the peek cursor, however you cannot call getMore() on the cloned cursor.
virtual Reference<IPeekCursor> cloneNoMore() = 0;
virtual void setProtocolVersion(ProtocolVersion version) = 0;
// if hasMessage() returns true, getMessage(), getMessageWithTags(), or reader() can be called.
// does not modify the cursor
virtual bool hasMessage() const = 0;
// pre: only callable if hasMessage() returns true
// return the tags associated with the message for the current sequence
virtual VectorRef<Tag> getTags() const = 0;
// pre: only callable if hasMessage() returns true
// returns the arena containing the contents of getMessage(), getMessageWithTags(), and reader()
virtual Arena& arena() = 0;
// pre: only callable if hasMessage() returns true
// returns an arena reader for the next message
// caller cannot call getMessage(), getMessageWithTags(), and reader()
// the caller must advance the reader before calling nextMessage()
virtual ArenaReader* reader() = 0;
// pre: only callable if hasMessage() returns true
// caller cannot call getMessage(), getMessageWithTags(), and reader()
// return the contents of the message for the current sequence
virtual StringRef getMessage() = 0;
// pre: only callable if hasMessage() returns true
// caller cannot call getMessage(), getMessageWithTags(), and reader()
// return the contents of the message for the current sequence
virtual StringRef getMessageWithTags() = 0;
// pre: only callable after getMessage(), getMessageWithTags(), or reader()
// post: hasMessage() and version() have been updated
// hasMessage() will never return false "in the middle" of a version (that is, if it does return false,
// version().subsequence will be zero) < FIXME: Can we lose this property?
virtual void nextMessage() = 0;
// advances the cursor to the supplied LogMessageVersion, and updates hasMessage
virtual void advanceTo(LogMessageVersion n) = 0;
// returns immediately if hasMessage() returns true.
// returns when either the result of hasMessage() or version() has changed, or a cursor has internally been
// exhausted.
virtual Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) = 0;
// returns when the failure monitor detects that the servers associated with the cursor are failed
virtual Future<Void> onFailed() = 0;
// returns false if:
// (1) the failure monitor detects that the servers associated with the cursor are failed
// (2) the interface is not present
// (3) the cursor cannot return any more results
virtual bool isActive() const = 0;
// returns true if the cursor cannot return any more results
virtual bool isExhausted() const = 0;
// Returns the smallest possible message version which the current message (if any) or a subsequent message
// might have (If hasMessage(), this is therefore the message version of the current message)
virtual const LogMessageVersion& version() const = 0;
// So far, the cursor has returned all messages which both satisfy the criteria passed to peek() to create the
// cursor AND have (popped(),0) <= message version number <= version(). Other messages might have been skipped.
virtual Version popped() const = 0;
// Returns the maximum version known to have been pushed (not necessarily durably) into the log system (0 is
// always a possible result!)
// This is the only member with a default implementation; it returns 0.
virtual Version getMaxKnownVersion() const { return 0; }
virtual Version getMinKnownCommittedVersion() const = 0;
virtual Optional<UID> getPrimaryPeekLocation() const = 0;
virtual Optional<UID> getCurrentPeekLocation() const = 0;
// Reference-counting hooks. The concrete cursors in this header forward them to their own
// ReferenceCounted<...> base.
virtual void addref() = 0;
virtual void delref() = 0;
};
// IPeekCursor implementation bound to a single TLogInterface (interf) and a single tag.
// All IPeekCursor overrides except addref/delref/getMaxKnownVersion are defined out of line.
struct ServerPeekCursor final : IPeekCursor, ReferenceCounted<ServerPeekCursor> {
Reference<AsyncVar<OptionalInterface<TLogInterface>>> interf;
const Tag tag;
// Peek reply held by the cursor; getMaxKnownVersion() reads results.maxKnownVersion.
TLogPeekReply results;
ArenaReader rd;
LogMessageVersion messageVersion, end;
Version poppedVersion;
TagsAndMessage messageAndTags;
bool hasMsg;
Future<Void> more;
UID randomID;
bool returnIfBlocked;
bool onlySpilled;
bool parallelGetMore;
int sequence;
Deque<Future<TLogPeekReply>> futureResults;
Future<Void> interfaceChanged;
// Same-named bookkeeping as ConnectionResetInfo, plus unknownReplies.
// NOTE(review): presumably used to detect slow peek connections -- confirm in the implementation.
double lastReset;
Future<Void> resetCheck;
int slowReplies;
int fastReplies;
int unknownReplies;
// Creates a cursor on `interf` for `tag` with the given begin/end versions and peek options.
ServerPeekCursor(Reference<AsyncVar<OptionalInterface<TLogInterface>>> const& interf,
Tag tag,
Version begin,
Version end,
bool returnIfBlocked,
bool parallelGetMore);
// Creates a cursor from already-materialized state (reply, versions, current message); takes no interface.
// NOTE(review): presumably what cloneNoMore() uses -- confirm.
ServerPeekCursor(TLogPeekReply const& results,
LogMessageVersion const& messageVersion,
LogMessageVersion const& end,
TagsAndMessage const& message,
bool hasMsg,
Version poppedVersion,
Tag tag);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward explicitly to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<ServerPeekCursor>::addref(); }
void delref() override { ReferenceCounted<ServerPeekCursor>::delref(); }
Version getMaxKnownVersion() const override { return results.maxKnownVersion; }
};
// IPeekCursor implementation built on a vector of child cursors (serverCursors).
// NOTE(review): presumably merges the per-server streams of one log set into a single ordered stream,
// using bestServer / readQuorum / the replication policy to decide when a message is complete -- the
// logic is defined out of line, confirm there.
struct MergedPeekCursor final : IPeekCursor, ReferenceCounted<MergedPeekCursor> {
Reference<LogSet> logSet;
std::vector<Reference<IPeekCursor>> serverCursors;
std::vector<LocalityEntry> locations;
std::vector<std::pair<LogMessageVersion, int>> sortedVersions;
Tag tag;
int bestServer, currentCursor, readQuorum;
Optional<LogMessageVersion> nextVersion;
LogMessageVersion messageVersion;
bool hasNextMessage;
UID randomID;
int tLogReplicationFactor;
Future<Void> more;
// Wraps existing child cursors, starting at `begin`.
MergedPeekCursor(std::vector<Reference<ILogSystem::IPeekCursor>> const& serverCursors, Version begin);
// Builds the cursor from a list of tLog interfaces plus the localities/policy describing them.
MergedPeekCursor(std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers,
int bestServer,
int readQuorum,
Tag tag,
Version begin,
Version end,
bool parallelGetMore,
std::vector<LocalityData> const& tLogLocalities,
Reference<IReplicationPolicy> const tLogPolicy,
int tLogReplicationFactor);
// Builds the cursor from already-materialized state.
// NOTE(review): presumably what cloneNoMore() uses -- confirm.
MergedPeekCursor(std::vector<Reference<IPeekCursor>> const& serverCursors,
LogMessageVersion const& messageVersion,
int bestServer,
int readQuorum,
Optional<LogMessageVersion> nextVersion,
Reference<LogSet> logSet,
int tLogReplicationFactor);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
// Helpers that are not part of IPeekCursor; defined out of line.
void calcHasMessage();
void updateMessage(bool usePolicy);
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward explicitly to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<MergedPeekCursor>::addref(); }
void delref() override { ReferenceCounted<MergedPeekCursor>::delref(); }
};
// IPeekCursor implementation built on several LogSets, holding a two-level vector of child cursors.
// NOTE(review): serverCursors is presumably indexed [log set][server within that set], parallel to
// logSets, with bestSet/bestServer naming the preferred source -- confirm in the implementation.
struct SetPeekCursor final : IPeekCursor, ReferenceCounted<SetPeekCursor> {
std::vector<Reference<LogSet>> logSets;
std::vector<std::vector<Reference<IPeekCursor>>> serverCursors;
Tag tag;
int bestSet, bestServer, currentSet, currentCursor;
std::vector<LocalityEntry> locations;
std::vector<std::pair<LogMessageVersion, int>> sortedVersions;
Optional<LogMessageVersion> nextVersion;
LogMessageVersion messageVersion;
bool hasNextMessage;
bool useBestSet;
UID randomID;
Future<Void> more;
// Builds the cursor from log sets for `tag` with the given begin/end versions.
SetPeekCursor(std::vector<Reference<LogSet>> const& logSets,
int bestSet,
int bestServer,
Tag tag,
Version begin,
Version end,
bool parallelGetMore);
// Builds the cursor from already-materialized child cursors and position.
// NOTE(review): presumably what cloneNoMore() uses -- confirm.
SetPeekCursor(std::vector<Reference<LogSet>> const& logSets,
std::vector<std::vector<Reference<IPeekCursor>>> const& serverCursors,
LogMessageVersion const& messageVersion,
int bestSet,
int bestServer,
Optional<LogMessageVersion> nextVersion,
bool useBestSet);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
// Helpers that are not part of IPeekCursor; defined out of line.
void calcHasMessage();
void updateMessage(int logIdx, bool usePolicy);
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward explicitly to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<SetPeekCursor>::addref(); }
void delref() override { ReferenceCounted<SetPeekCursor>::delref(); }
};
// IPeekCursor implementation holding a list of child cursors together with a list of epoch end versions.
// NOTE(review): presumably chains cursors from successive epochs, switching cursor when the matching
// entry of epochEnds is reached -- the logic is defined out of line, confirm there.
struct MultiCursor final : IPeekCursor, ReferenceCounted<MultiCursor> {
std::vector<Reference<IPeekCursor>> cursors;
std::vector<LogMessageVersion> epochEnds;
Version poppedVersion;
MultiCursor(std::vector<Reference<IPeekCursor>> cursors, std::vector<LogMessageVersion> epochEnds);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward explicitly to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<MultiCursor>::addref(); }
void delref() override { ReferenceCounted<MultiCursor>::delref(); }
};
// IPeekCursor implementation that stores messages from its child cursors as BufferedMessage values
// (per-cursor in cursorMessages, combined in messages).
// NOTE(review): presumably drains the child cursors into the buffers and serves messages from there
// in version order -- the logic is defined out of line, confirm there.
struct BufferedCursor final : IPeekCursor, ReferenceCounted<BufferedCursor> {
// One buffered message: its payload, tags and version, plus the arena passed in alongside them.
// Ordering and equality consider the version only.
struct BufferedMessage {
Arena arena;
StringRef message;
VectorRef<Tag> tags;
LogMessageVersion version;
BufferedMessage() {}
// Message-less value carrying only a version.
explicit BufferedMessage(Version version) : version(version) {}
BufferedMessage(Arena arena,
StringRef message,
const VectorRef<Tag>& tags,
const LogMessageVersion& version)
: arena(arena), message(message), tags(tags), version(version) {}
bool operator<(BufferedMessage const& r) const { return version < r.version; }
bool operator==(BufferedMessage const& r) const { return version == r.version; }
};
std::vector<Reference<IPeekCursor>> cursors;
std::vector<Deque<BufferedMessage>> cursorMessages;
std::vector<BufferedMessage> messages;
int messageIndex;
LogMessageVersion messageVersion;
Version end;
bool hasNextMessage;
bool withTags;
bool knownUnique;
Version minKnownCommittedVersion;
Version poppedVersion;
Version initialPoppedVersion;
bool canDiscardPopped;
Future<Void> more;
int targetQueueSize;
UID randomID;
// FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support
// that upgrade.
bool collectTags;
// Helper that is not part of IPeekCursor; defined out of line.
void combineMessages();
// Wraps existing child cursors.
BufferedCursor(std::vector<Reference<IPeekCursor>> cursors,
Version begin,
Version end,
bool withTags,
bool collectTags,
bool canDiscardPopped);
// Builds the cursor from a list of tLog interfaces for a single tag.
BufferedCursor(std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> const& logServers,
Tag tag,
Version begin,
Version end,
bool parallelGetMore);
Reference<IPeekCursor> cloneNoMore() override;
void setProtocolVersion(ProtocolVersion version) override;
Arena& arena() override;
ArenaReader* reader() override;
bool hasMessage() const override;
void nextMessage() override;
StringRef getMessage() override;
StringRef getMessageWithTags() override;
VectorRef<Tag> getTags() const override;
void advanceTo(LogMessageVersion n) override;
Future<Void> getMore(TaskPriority taskID = TaskPriority::TLogPeekReply) override;
Future<Void> onFailed() override;
bool isActive() const override;
bool isExhausted() const override;
const LogMessageVersion& version() const override;
Version popped() const override;
Version getMinKnownCommittedVersion() const override;
Optional<UID> getPrimaryPeekLocation() const override;
Optional<UID> getCurrentPeekLocation() const override;
// Both bases declare addref/delref; forward explicitly to the ReferenceCounted implementation.
void addref() override { ReferenceCounted<BufferedCursor>::addref(); }
void delref() override { ReferenceCounted<BufferedCursor>::delref(); }
};
// Reference-counting hooks supplied by the concrete log system implementation.
virtual void addref() = 0;
virtual void delref() = 0;
virtual std::string describe() const = 0;
virtual UID getDebugID() const = 0;
virtual void toCoreState(DBCoreState&) = 0;
virtual bool remoteStorageRecovered() = 0;
// Returns if and when the output of toCoreState() would change (for example, when older logs can be discarded from
// the state)
virtual Future<Void> onCoreStateChanged() = 0;
// Called when a core state has been written to the coordinators
virtual void coreStateWritten(DBCoreState const& newState) = 0;
// Never returns normally, but throws an error if the subsystem stops working
virtual Future<Void> onError() = 0;
// Historical signature, kept for reference:
// Future<Void> push( UID bundle, int64_t seq, VectorRef<TaggedMessageRef> messages );
//
// Waits for the version number of the bundle (in this epoch) to be prevVersion (i.e. for all pushes ordered
// earlier). Puts the given messages into the bundle, each with the given tags, and with message versions
// (version, 0) - (version, N). Changes the version number of the bundle to be version (unblocking the next push).
// Returns when the preceding changes are durable. (Later we will need multiple return signals for different
// durability levels.) If the current epoch has ended, push will not return, and the pushed messages will not be
// visible in any subsequent epoch (but may become visible in this epoch).
virtual Future<Version> push(Version prevVersion,
Version version,
Version knownCommittedVersion,
Version minKnownCommittedVersion,
struct LogPushData& data,
SpanID const& spanContext,
Optional<UID> debugID = Optional<UID>()) = 0;
// Returns (via cursor interface) a stream of messages with the given tag and message versions >= (begin, 0),
// ordered by message version. If pop was previously or concurrently called with upTo > begin, the cursor may not
// return all such messages. In that case cursor->popped() will be greater than begin to reflect that.
virtual Reference<IPeekCursor> peek(UID dbgid,
Version begin,
Optional<Version> end,
Tag tag,
bool parallelGetMore = false) = 0;
// Same contract as peek(), but for a set of tags
virtual Reference<IPeekCursor> peek(UID dbgid,
Version begin,
Optional<Version> end,
std::vector<Tag> tags,
bool parallelGetMore = false) = 0;
// Same contract as peek(), but blocks until the preferred log server(s) for the given tag are available (and is
// correspondingly less expensive)
virtual Reference<IPeekCursor> peekSingle(
UID dbgid,
Version begin,
Tag tag,
std::vector<std::pair<Version, Tag>> history = std::vector<std::pair<Version, Tag>>()) = 0;
// Same contract as peek(), but can only peek from the logs elected in the same generation.
// If the preferred log server is down, a different log from the same generation will merge results locally before
// sending them to the log router.
virtual Reference<IPeekCursor> peekLogRouter(UID dbgid, Version begin, Tag tag) = 0;
// Same contract as peek(), but only for peeking the txsLocality. It allows specifying a preferred peek locality.
virtual Reference<IPeekCursor> peekTxs(UID dbgid,
Version begin,
int8_t peekLocality,
Version localEnd,
bool canDiscardPopped) = 0;
virtual Future<Version> getTxsPoppedVersion() = 0;
virtual Version getKnownCommittedVersion() = 0;
virtual Future<Void> onKnownCommittedVersionChange() = 0;
virtual void popTxs(Version upTo, int8_t popLocality = tagLocalityInvalid) = 0;
// Permits, but does not require, the log subsystem to strip `tag` from any or all messages with message versions <
// (upTo,0). The popping of any given message may be arbitrarily delayed.
virtual void pop(Version upTo,
Tag tag,
Version knownCommittedVersion = 0,
int8_t popLocality = tagLocalityInvalid) = 0;
// Returns success after confirming that pushes in the current epoch are still possible
virtual Future<Void> confirmEpochLive(Optional<UID> debugID = Optional<UID>()) = 0;
// Ends the current epoch without starting a new one
virtual Future<Void> endEpoch() = 0;
// The next two factory functions construct a new ILogSystem implementation from the given
// ServerDBInfo/LogSystemConfig. Might return a null reference if there isn't a fully recovered log system
// available. The caller can peek() the returned log system and can push() if it has version numbers reserved for
// it and prevVersions
static Reference<ILogSystem> fromServerDBInfo(
UID const& dbgid,
struct ServerDBInfo const& db,
bool useRecoveredAt = false,
Optional<PromiseStream<Future<Void>>> addActor = Optional<PromiseStream<Future<Void>>>());
static Reference<ILogSystem> fromLogSystemConfig(
UID const& dbgid,
struct LocalityData const&,
struct LogSystemConfig const&,
bool excludeRemote = false,
bool useRecoveredAt = false,
Optional<PromiseStream<Future<Void>>> addActor = Optional<PromiseStream<Future<Void>>>());
// Constructs a new ILogSystem implementation from the old log data within a ServerDBInfo/LogSystemConfig. Might
// return a null reference if there isn't a fully recovered log system available.
static Reference<ILogSystem> fromOldLogSystemConfig(UID const& dbgid,
struct LocalityData const&,
struct LogSystemConfig const&);
// Constructs a new ILogSystem implementation based on the given oldState and rejoining log servers.
// Ensures that any calls to push or confirmEpochLive in the current epoch but strictly later than change_epoch will
// not return. Whenever changes in the set of available log servers require restarting recovery with a different end
// sequence, outLogSystem will be changed to a new ILogSystem
static Future<Void> recoverAndEndEpoch(Reference<AsyncVar<Reference<ILogSystem>>> const& outLogSystem,
UID const& dbgid,
DBCoreState const& oldState,
FutureStream<TLogRejoinRequest> const& rejoins,
LocalityData const& locality,
bool* forceRecovery);
// Call only on an ILogSystem obtained from recoverAndEndEpoch()
// Returns the first unreadable version number of the recovered epoch (i.e. message version numbers < (get_end(), 0)
// will be readable)
virtual Version getEnd() const = 0;
// Returns the start version of current epoch for backup workers.
virtual Version getBackupStartVersion() const = 0;
// Per-epoch summary: the number of log router tags and the epoch's begin/end versions.
struct EpochTagsVersionsInfo {
    int32_t logRouterTags; // Number of log router tags.
    Version epochBegin;
    Version epochEnd;

    explicit EpochTagsVersionsInfo(int32_t n, Version begin, Version end)
      : logRouterTags{ n }, epochBegin{ begin }, epochEnd{ end } {}
};
// Returns EpochTagsVersionsInfo for old epochs that this log system is aware of, excluding the current epoch.
virtual std::map<LogEpoch, EpochTagsVersionsInfo> getOldEpochTagsVersionsInfo() const = 0;
// Call only on an ILogSystem obtained from recoverAndEndEpoch()
// Returns an ILogSystem representing a new epoch immediately following this one. The new epoch is only provisional
// until the caller updates the coordinated DBCoreState
virtual Future<Reference<ILogSystem>> newEpoch(
struct RecruitFromConfigurationReply const& recr,
Future<struct RecruitRemoteFromConfigurationReply> const& fRemoteWorkers,
DatabaseConfiguration const& config,
LogEpoch recoveryCount,
int8_t primaryLocality,
int8_t remoteLocality,
std::vector<Tag> const& allTags,
Reference<AsyncVar<bool>> const& recruitmentStalled) = 0;
// Returns the physical configuration of this LogSystem, that could be used to construct an equivalent LogSystem
// using fromLogSystemConfig()
virtual LogSystemConfig getLogSystemConfig() const = 0;
virtual Standalone<StringRef> getLogsValue() const = 0;
// Returns when the log system configuration has changed due to a tlog rejoin.
virtual Future<Void> onLogSystemConfigChange() = 0;
// Appends to `locations` the log server locations that messages carrying `tags` must be pushed to.
// NOTE(review): described from the signature and from LogSet::getPushLocations(); the exact contract is set by
// the implementing class -- confirm there.
virtual void getPushLocations(VectorRef<Tag> tags,
std::vector<int>& locations,
bool allLocations = false) const = 0;
void getPushLocations(std::vector<Tag> const& tags, std::vector<int>& locations, bool allLocations = false) {
getPushLocations(VectorRef<Tag>((Tag*)&tags.front(), tags.size()), locations, allLocations);
}
// Tag and topology queries.
virtual bool hasRemoteLogs() const = 0;
virtual Tag getRandomRouterTag() const = 0;
virtual int getLogRouterTags() const = 0; // Returns the number of router tags.
virtual Tag getRandomTxsTag() const = 0;
// Returns the TLogVersion of the current generation of TLogs.
// (This only exists because getLogSystemConfig is a significantly more expensive call.)
virtual TLogVersion getTLogVersion() const = 0;
virtual void stopRejoins() = 0;
// Pseudo-tag support.
// Returns the pseudo tag to be popped for the given process class. If the
// process class doesn't use pseudo tag, return the same tag.
virtual Tag getPseudoPopTag(Tag tag, ProcessClass::ClassType type) const = 0;
virtual bool hasPseudoLocality(int8_t locality) const = 0;
// Returns the actual version to be popped from the log router tag for the given pseudo tag.
// For instance, a pseudo tag (-8, 2) means the actual popping tag is (-2, 2). Assuming there
// are multiple pseudo tags, the returned version is the min(all pseudo tags' "upTo" versions).
virtual Version popPseudoLocalityTag(Tag tag, Version upTo) = 0;
// Backup worker bookkeeping.
virtual void setBackupWorkers(const std::vector<InitializeBackupReply>& replies) = 0;
// Removes a finished backup worker from log system and returns true. Returns false
// if the worker is not found.
virtual bool removeBackupWorker(const BackupWorkerDoneRequest& req) = 0;
virtual LogEpoch getOldestBackupEpoch() const = 0;
virtual void setOldestBackupEpoch(LogEpoch epoch) = 0;
};
// A handle to a string that is stored directly behind its own 4-byte length field.
// The handle itself is only pointer-sized (8 bytes vs 12 bytes for StringRef), at the price of the
// referenced string being 4 bytes bigger and of substring operations not being as efficient as they
// are with StringRef. It is a good fit when there might be lots of references to the same exact string.
struct LengthPrefixedStringRef {
	// Address of the 4-byte length; the string bytes begin immediately after it. nullptr for a
	// default-constructed handle.
	uint32_t* length;

	LengthPrefixedStringRef() : length(nullptr) {}
	LengthPrefixedStringRef(uint32_t* length) : length(length) {}

	// Raw access to the length prefix (may be nullptr).
	uint32_t* getLengthPtr() const { return length; }

	// The string's length as stored in the prefix. Asserts that the handle is non-null.
	int expectedSize() const {
		ASSERT(length);
		return *length;
	}

	// Builds a StringRef over the bytes that follow the length prefix. Asserts that the handle is
	// non-null.
	StringRef toStringRef() const {
		ASSERT(length);
		uint8_t* firstByte = (uint8_t*)(length + 1);
		return StringRef(firstByte, *length);
	}
};
// Comparison functor for types exposing a `first` member (e.g. std::pair): `a` is ordered before
// `b` exactly when a.first < b.first; every other part of T is ignored.
template <class T>
struct CompareFirst {
	bool operator()(T const& a, T const& b) const {
		return a.first < b.first;
	}
};
// Structure to store serialized mutations sent from the proxy to the
// transaction logs. The serialization repeats with the following format:
//
// +----------------------+ +----------------------+ +----------+ +----------------+ +----------------------+
// | Message size | | Subsequence | | # of tags| | Tag | . . . . | Mutation |
// +----------------------+ +----------------------+ +----------+ +----------------+ +----------------------+
// <------- 32 bits ------> <------- 32 bits ------> <- 16 bits-> <---- 24 bits ---> <---- variable bits --->
//
// `Mutation` can be a serialized MutationRef or a special metadata message
// such as LogProtocolMessage or SpanContextMessage. The type of `Mutation` is
// uniquely identified by its first byte -- a value from MutationRef::Type.
//
struct LogPushData : NonCopyable {
// Log subsequences have to start at 1 (the MergedPeekCursor relies on this to make sure we never have !hasMessage()
// in the middle of data for a version
explicit LogPushData(Reference<ILogSystem> logSystem) : logSystem(logSystem), subsequence(1) {
for (auto& log : logSystem->getLogSystemConfig().tLogs) {
if (log.isLocal) {
for (int i = 0; i < log.tLogs.size(); i++) {
messagesWriter.push_back(BinaryWriter(AssumeVersion(g_network->protocolVersion())));
}
}
}
isEmptyMessage = std::vector<bool>(messagesWriter.size(), false);
}
void addTxsTag() {
if (logSystem->getTLogVersion() >= TLogVersion::V4) {
next_message_tags.push_back(logSystem->getRandomTxsTag());
} else {
next_message_tags.push_back(txsTag);
}
}
// addTag() adds a tag for the *next* message to be added
void addTag(Tag tag) { next_message_tags.push_back(tag); }
template <class T>
void addTags(T tags) {
next_message_tags.insert(next_message_tags.end(), tags.begin(), tags.end());
}
// Add transaction info to be written before the first mutation in the transaction.
void addTransactionInfo(SpanID const& context) {
TEST(!spanContext.isValid()); // addTransactionInfo with invalid SpanID
spanContext = context;
writtenLocations.clear();
}
void writeMessage(StringRef rawMessageWithoutLength, bool usePreviousLocations) {
if (!usePreviousLocations) {
prev_tags.clear();
if (logSystem->hasRemoteLogs()) {
prev_tags.push_back(logSystem->getRandomRouterTag());
}
for (auto& tag : next_message_tags) {
prev_tags.push_back(tag);
}
msg_locations.clear();
logSystem->getPushLocations(prev_tags, msg_locations);
next_message_tags.clear();
}
uint32_t subseq = this->subsequence++;
uint32_t msgsize =
rawMessageWithoutLength.size() + sizeof(subseq) + sizeof(uint16_t) + sizeof(Tag) * prev_tags.size();
for (int loc : msg_locations) {
BinaryWriter& wr = messagesWriter[loc];
wr << msgsize << subseq << uint16_t(prev_tags.size());
for (auto& tag : prev_tags)
wr << tag;
wr.serializeBytes(rawMessageWithoutLength);
}
}
template <class T>
void writeTypedMessage(T const& item, bool metadataMessage = false, bool allLocations = false) {
prev_tags.clear();
if (logSystem->hasRemoteLogs()) {
prev_tags.push_back(logSystem->getRandomRouterTag());
}
for (auto& tag : next_message_tags) {
prev_tags.push_back(tag);
}
msg_locations.clear();
logSystem->getPushLocations(prev_tags, msg_locations, allLocations);
BinaryWriter bw(AssumeVersion(g_network->protocolVersion()));
// Metadata messages (currently LogProtocolMessage is the only metadata
// message) should be written before span information. If this isn't a
// metadata message, make sure all locations have had transaction info
// written to them. Mutations may have different sets of tags, so it
// is necessary to check all tag locations each time a mutation is
// written.
if (!metadataMessage) {
uint32_t subseq = this->subsequence++;
bool updatedLocation = false;
for (int loc : msg_locations) {
updatedLocation = writeTransactionInfo(loc, subseq) || updatedLocation;
}
// If this message doesn't write to any new locations, the
// subsequence wasn't actually used and can be decremented.
if (!updatedLocation) {
this->subsequence--;
TEST(true); // No new SpanContextMessage written to transaction logs
ASSERT(this->subsequence > 0);
}
} else {
// When writing a metadata message, make sure transaction state has
// been reset. If you are running into this assertion, make sure
// you are calling addTransactionInfo before each transaction.
ASSERT(writtenLocations.size() == 0);
}
uint32_t subseq = this->subsequence++;
bool first = true;
int firstOffset = -1, firstLength = -1;
for (int loc : msg_locations) {
BinaryWriter& wr = messagesWriter[loc];
if (first) {
firstOffset = wr.getLength();
wr << uint32_t(0) << subseq << uint16_t(prev_tags.size());
for (auto& tag : prev_tags)
wr << tag;
wr << item;
firstLength = wr.getLength() - firstOffset;
*(uint32_t*)((uint8_t*)wr.getData() + firstOffset) = firstLength - sizeof(uint32_t);
DEBUG_TAGS_AND_MESSAGE("ProxyPushLocations",
invalidVersion,
StringRef(((uint8_t*)wr.getData() + firstOffset), firstLength))
.detail("PushLocations", msg_locations);
first = false;
} else {
BinaryWriter& from = messagesWriter[msg_locations[0]];
wr.serializeBytes((uint8_t*)from.getData() + firstOffset, firstLength);
}
}
next_message_tags.clear();
}
Standalone<StringRef> getMessages(int loc) {
return messagesWriter[loc].toValue();
}
// Records if a tlog (specified by "loc") will receive an empty version batch message.
// "value" is the message returned by getMessages() call.
void recordEmptyMessage(int loc, const Standalone<StringRef>& value) {
if (!isEmptyMessage[loc]) {
BinaryWriter w(AssumeVersion(g_network->protocolVersion()));
Standalone<StringRef> v = w.toValue();
if (value.size() > v.size()) {
isEmptyMessage[loc] = true;
}
}
}
// Returns the ratio of empty messages in this version batch.
// MUST be called after getMessages() and recordEmptyMessage().
float getEmptyMessageRatio() const {
auto count = std::count(isEmptyMessage.begin(), isEmptyMessage.end(), false);
ASSERT_WE_THINK(isEmptyMessage.size() > 0);
return 1.0 * count / isEmptyMessage.size();
}
private:
Reference<ILogSystem> logSystem;
std::vector<Tag> next_message_tags;
std::vector<Tag> prev_tags;
std::vector<BinaryWriter> messagesWriter;
std::vector<bool> isEmptyMessage; // if messagesWriter has written anything
std::vector<int> msg_locations;
// Stores message locations that have had span information written to them
// for the current transaction. Adding transaction info will reset this
// field.
std::unordered_set<int> writtenLocations;
uint32_t subsequence;
SpanID spanContext;
// Writes transaction info to the message stream at the given location if
// it has not already been written (for the current transaction). Returns
// true on a successful write, and false if the location has already been
// written.
bool writeTransactionInfo(int location, uint32_t subseq) {
if (!FLOW_KNOBS->WRITE_TRACING_ENABLED || logSystem->getTLogVersion() < TLogVersion::V6 ||
writtenLocations.count(location) != 0) {
return false;
}
TEST(true); // Wrote SpanContextMessage to a transaction log
writtenLocations.insert(location);
BinaryWriter& wr = messagesWriter[location];
SpanContextMessage contextMessage(spanContext);
int offset = wr.getLength();
wr << uint32_t(0) << subseq << uint16_t(prev_tags.size());
for (auto& tag : prev_tags)
wr << tag;
wr << contextMessage;
int length = wr.getLength() - offset;
*(uint32_t*)((uint8_t*)wr.getData() + offset) = length - sizeof(uint32_t);
return true;
}
};
#endif