foundationdb/fdbserver/storageserver.actor.cpp

4161 lines
171 KiB
C++

/*
* storageserver.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cinttypes>
#include "fdbrpc/fdbrpc.h"
#include "fdbrpc/LoadBalance.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Tracing.h"
#include "flow/IndexedSet.h"
#include "flow/Hash3.h"
#include "flow/ActorCollection.h"
#include "flow/SystemMonitor.h"
#include "flow/Util.h"
#include "fdbclient/Atomic.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/KeyRangeMap.h"
#include "fdbclient/MasterProxyInterface.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/Notified.h"
#include "fdbclient/StatusClient.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/VersionedMap.h"
#include "fdbserver/FDBExecHelper.actor.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LatencyBandConfig.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LogSystem.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/MutationTracking.h"
#include "fdbserver/RecoveryState.h"
#include "fdbserver/StorageMetrics.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/TLogInterface.h"
#include "fdbserver/WaitFailure.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbrpc/sim_validation.h"
#include "fdbrpc/Smoother.h"
#include "flow/Stats.h"
#include "flow/TDMetric.actor.h"
#include <type_traits>
#include "flow/actorcompiler.h" // This must be the last #include.
using std::pair;
using std::make_pair;
#ifndef __INTEL_COMPILER
#pragma region Data Structures
#endif
#define SHORT_CIRCUT_ACTUAL_STORAGE 0
// Reports whether the given error is one of the codes in the list below.
// Any error code that is not listed yields false.
inline bool canReplyWith(Error e) {
	const auto code = e.code();
	// error_code_all_alternatives_failed is deliberately left out of the list (it was commented out).
	return code == error_code_transaction_too_old ||
	       code == error_code_future_version ||
	       code == error_code_wrong_shard_server ||
	       code == error_code_process_behind;
}
// Bookkeeping for a key range that is being added to a storage server.
// `phase` records which of the Phase states the shard is currently in; isTransferred()
// reports whether that state is Waiting.
struct AddingShard : NonCopyable {
KeyRange keys;
Future<Void> fetchClient; // holds FetchKeys() actor
Promise<Void> fetchComplete;
Promise<Void> readWrite;
std::deque< Standalone<VerUpdateRef> > updates; // during the Fetching phase, mutations with key in keys and version>=(fetchClient's) fetchVersion;
struct StorageServer* server;
Version transferredVersion;
enum Phase { WaitPrevious, Fetching, Waiting };
Phase phase;
// Declared here, defined out of line (the definition is not part of this section of the file).
AddingShard( StorageServer* server, KeyRangeRef const& keys );
// When fetchKeys "partially completes" (splits an adding shard in two), this is used to construct the left half
// The new object shares prev's fetchClient future and copies prev's server, transferredVersion and phase.
// fetchComplete, readWrite and updates are NOT copied; they start out default-constructed.
AddingShard( AddingShard* prev, KeyRange const& keys )
: keys(keys), fetchClient(prev->fetchClient), server(prev->server), transferredVersion(prev->transferredVersion), phase(prev->phase)
{
}
// Sends Void() on fetchComplete and readWrite if they have not been set yet.
~AddingShard() {
if( !fetchComplete.isSet() )
fetchComplete.send(Void());
if( !readWrite.isSet() )
readWrite.send(Void());
}
// Declared here, defined out of line.
void addMutation( Version version, MutationRef const& mutation );
// True once the shard has reached the Waiting phase.
bool isTransferred() const { return phase == Waiting; }
};
// Describes one entry of the storage server's shard map. A shard is in exactly one of
// three states, encoded by the two pointers:
//   * not assigned : adding == nullptr and readWrite == nullptr
//   * adding       : adding != nullptr (this object owns the AddingShard and deletes it)
//   * read/write   : readWrite != nullptr
struct ShardInfo : ReferenceCounted<ShardInfo>, NonCopyable {
	AddingShard* adding; // owned; released in the destructor
	struct StorageServer* readWrite;
	KeyRange keys;
	uint64_t changeCounter; // overwritten by StorageServer::addShard() when the shard is registered

	// changeCounter is zero-initialised so that it never holds an indeterminate value,
	// even if it is read before the shard has been passed to addShard().
	ShardInfo(KeyRange keys, AddingShard* adding, StorageServer* readWrite)
	  : adding(adding), readWrite(readWrite), keys(keys), changeCounter(0)
	{
	}

	~ShardInfo() {
		delete adding;
	}

	// Factory helpers, one per shard state. Each returns a raw pointer to a newly allocated ShardInfo.
	static ShardInfo* newNotAssigned(KeyRange keys) { return new ShardInfo(keys, nullptr, nullptr); }
	static ShardInfo* newReadWrite(KeyRange keys, StorageServer* data) { return new ShardInfo(keys, nullptr, data); }
	static ShardInfo* newAdding(StorageServer* data, KeyRange keys) { return new ShardInfo(keys, new AddingShard(data, keys), nullptr); }
	// Builds an adding shard for `keys` whose AddingShard is constructed from oldShard
	// via AddingShard's splitting constructor.
	static ShardInfo* addingSplitLeft( KeyRange keys, AddingShard* oldShard) { return new ShardInfo(keys, new AddingShard(oldShard, keys), nullptr); }

	bool isReadable() const { return readWrite != nullptr; }
	bool notAssigned() const { return !readWrite && !adding; }
	bool assigned() const { return readWrite || adding; }
	// True for read/write shards and for adding shards that have reached the transferred state.
	bool isInVersionedData() const { return readWrite || (adding && adding->isTransferred()); }
	// Declared here, defined out of line.
	void addMutation( Version version, MutationRef const& mutation );
	// True for read/write shards and for adding shards whose fetchComplete promise has been set.
	bool isFetched() const { return readWrite || ( adding && adding->fetchComplete.isSet() ); }

	// Human-readable name of the current state, for tracing/debugging.
	const char* debugDescribeState() const {
		if (notAssigned()) return "NotAssigned";
		else if (adding && !adding->isTransferred()) return "AddingFetching";
		else if (adding) return "AddingTransferred";
		else return "ReadWrite";
	}
};
// Wrapper that the storage server uses to talk to its IKeyValueStore.
// The inline members below forward directly to the underlying store; the remaining
// members are declared here and defined out of line.
struct StorageServerDisk {
explicit StorageServerDisk( struct StorageServer* data, IKeyValueStore* storage ) : data(data), storage(storage) {}
void makeNewStorageServerDurable();
bool makeVersionMutationsDurable( Version& prevStorageVersion, Version newStorageVersion, int64_t& bytesLeft );
void makeVersionDurable( Version version );
Future<bool> restoreDurableState();
void changeLogProtocol(Version version, ProtocolVersion protocol);
void writeMutation( MutationRef mutation );
void writeKeyValue( KeyValueRef kv );
void clearRange( KeyRangeRef keys );
// Straight pass-throughs to the key-value store.
Future<Void> getError() { return storage->getError(); }
Future<Void> init() { return storage->init(); }
Future<Void> commit() { return storage->commit(); }
// SOMEDAY: Put readNextKeyInclusive in IKeyValueStore
// Resolves to the first key found in [key, allKeys.end), or to allKeys.end if that range is empty (see readFirstKey).
Future<Key> readNextKeyInclusive( KeyRef key ) { return readFirstKey(storage, KeyRangeRef(key, allKeys.end)); }
Future<Optional<Value>> readValue( KeyRef key, Optional<UID> debugID = Optional<UID>() ) { return storage->readValue(key, debugID); }
Future<Optional<Value>> readValuePrefix( KeyRef key, int maxLength, Optional<UID> debugID = Optional<UID>() ) { return storage->readValuePrefix(key, maxLength, debugID); }
Future<Standalone<RangeResultRef>> readRange( KeyRangeRef keys, int rowLimit = 1<<30, int byteLimit = 1<<30 ) { return storage->readRange(keys, rowLimit, byteLimit); }
KeyValueStoreType getKeyValueStoreType() const { return storage->getType(); }
StorageBytes getStorageBytes() const { return storage->getStorageBytes(); }
std::tuple<size_t, size_t, size_t> getSize() const { return storage->getSize(); }
private:
struct StorageServer* data;
IKeyValueStore* storage;
void writeMutations( MutationListRef mutations, Version debugVersion, const char* debugContext );
// Reads at most one row from `range` and returns its key; returns range.end when the read comes back empty.
ACTOR static Future<Key> readFirstKey( IKeyValueStore* storage, KeyRangeRef range ) {
Standalone<RangeResultRef> r = wait( storage->readRange( range, 1 ) );
if (r.size()) return r[0].key;
else return range.end;
}
};
struct UpdateEagerReadInfo {
std::vector<KeyRef> keyBegin;
std::vector<Key> keyEnd; // these are for ClearRange
std::vector<std::pair<KeyRef, int>> keys;
std::vector<Optional<Value>> value;
Arena arena;
void addMutations( VectorRef<MutationRef> const& mutations ) {
for(auto& m : mutations)
addMutation(m);
}
void addMutation( MutationRef const& m ) {
// SOMEDAY: Theoretically we can avoid a read if there is an earlier overlapping ClearRange
if (m.type == MutationRef::ClearRange && !m.param2.startsWith(systemKeys.end))
keyBegin.push_back( m.param2 );
else if (m.type == MutationRef::CompareAndClear) {
keyBegin.push_back(keyAfter(m.param1, arena));
if (keys.size() > 0 && keys.back().first == m.param1) {
// Don't issue a second read, if the last read was equal to the current key.
// CompareAndClear is likely to be used after another atomic operation on same key.
keys.back().second = std::max(keys.back().second, m.param2.size() + 1);
} else {
keys.emplace_back(m.param1, m.param2.size() + 1);
}
} else if ((m.type == MutationRef::AppendIfFits) || (m.type == MutationRef::ByteMin) ||
(m.type == MutationRef::ByteMax))
keys.emplace_back(m.param1, CLIENT_KNOBS->VALUE_SIZE_LIMIT);
else if (isAtomicOp((MutationRef::Type) m.type))
keys.emplace_back(m.param1, m.param2.size());
}
void finishKeyBegin() {
std::sort(keyBegin.begin(), keyBegin.end());
keyBegin.resize( std::unique(keyBegin.begin(), keyBegin.end()) - keyBegin.begin() );
std::sort(keys.begin(), keys.end(), [](const pair<KeyRef, int>& lhs, const pair<KeyRef, int>& rhs) { return (lhs.first < rhs.first) || (lhs.first == rhs.first && lhs.second > rhs.second); } );
keys.resize(std::unique(keys.begin(), keys.end(), [](const pair<KeyRef, int>& lhs, const pair<KeyRef, int>& rhs) { return lhs.first == rhs.first; } ) - keys.begin());
//value gets populated in doEagerReads
}
Optional<Value>& getValue(KeyRef key) {
int i = std::lower_bound(keys.begin(), keys.end(), pair<KeyRef, int>(key, 0), [](const pair<KeyRef, int>& lhs, const pair<KeyRef, int>& rhs) { return lhs.first < rhs.first; } ) - keys.begin();
ASSERT( i < keys.size() && keys[i].first == key );
return value[i];
}
KeyRef getKeyEnd( KeyRef key ) {
int i = std::lower_bound(keyBegin.begin(), keyBegin.end(), key) - keyBegin.begin();
ASSERT( i < keyBegin.size() && keyBegin[i] == key );
return keyEnd[i];
}
};
// Fixed number of bytes accounted for each new version: one mutationLog map entry plus
// two versioned-map entries (see the per-term notes below).
const int VERSION_OVERHEAD = 64 + sizeof(Version) + sizeof(Standalone<VersionUpdateRef>) + //mutationLog, 64b overhead for map
2 * (64 + sizeof(Version) + sizeof(Reference<VersionedMap<KeyRef, ValueOrClearToRef>::PTreeT>)); //versioned map [ x2 for createNewVersion(version+1) ], 64b overhead for map
// Number of bytes accounted for a single mutation: twice the versioned-map per-item
// overhead plus twice the mutation's own size (fixed overhead + both parameters).
static int mvccStorageBytes( MutationRef const& m ) {
	const auto mutationBytes = MutationRef::OVERHEAD_BYTES + m.param1.size() + m.param2.size();
	return VersionedMap<KeyRef, ValueOrClearToRef>::overheadPerItem * 2 + mutationBytes * 2;
}
// A list of version updates together with an Arena.
// NOTE(review): presumably `arena` owns the memory referenced by the VerUpdateRef entries in `changes` -- confirm against the users of this struct.
struct FetchInjectionInfo {
Arena arena;
vector<VerUpdateRef> changes;
};
struct StorageServer {
typedef VersionedMap<KeyRef, ValueOrClearToRef> VersionedData;
private:
// versionedData contains sets and clears.
// * Nonoverlapping: No clear overlaps a set or another clear, or adjoins another clear.
// ~ Clears are maximal: If versionedData.at(v) contains a clear [b,e) then
// there is a key data[e]@v, or e==allKeys.end, or a shard boundary or former boundary at e
// * Reads are possible: When k is in a readable shard, for any v in [storageVersion, version.get()],
// storage[k] + versionedData.at(v)[k] = database[k] @ v (storage[k] might be @ any version in [durableVersion, storageVersion])
// * Transferred shards are partially readable: When k is in an adding, transferred shard, for any v in [transferredVersion, version.get()],
// storage[k] + versionedData.at(v)[k] = database[k] @ v
// * versionedData contains versions [storageVersion(), version.get()]. It might also contain version (version.get()+1), in which changeDurableVersion may be deleting ghosts, and/or it might
// contain later versions if applyUpdate is on the stack.
// * Old shards are erased: versionedData.atLatest() has entries (sets or intersecting clears) only for keys in readable or adding,transferred shards.
// Earlier versions may have extra entries for shards that *were* readable or adding,transferred when those versions were the latest, but they eventually are forgotten.
// * Old mutations are erased: All items in versionedData.atLatest() have insertVersion() > durableVersion(), but views
// at older versions may contain older items which are also in storage (this is OK because of idempotency)
VersionedData versionedData;
std::map<Version, Standalone<VersionUpdateRef>> mutationLog; // versions (durableVersion, version]
public:
Tag tag;
vector<pair<Version,Tag>> history;
vector<pair<Version,Tag>> allHistory;
Version poppedAllAfter;
std::map<Version, Arena> freeable; // for each version, an Arena that must be held until that version is < oldestVersion
Arena lastArena;
double cpuUsage;
double diskUsage;
// Accessors for the private mutationLog and versionedData members.
// The const overloads give read-only access; the getMutable*/mutable* variants return a modifiable reference.
std::map<Version, Standalone<VersionUpdateRef>> const& getMutationLog() const { return mutationLog; }
std::map<Version, Standalone<VersionUpdateRef>>& getMutableMutationLog() { return mutationLog; }
VersionedData const& data() const { return versionedData; }
VersionedData& mutableData() { return versionedData; }
double old_rate = 1.0;
double currentRate() {
auto versionLag = version.get() - durableVersion.get();
double res;
if (versionLag >= SERVER_KNOBS->STORAGE_DURABILITY_LAG_HARD_MAX) {
res = 0.0;
} else if (versionLag > SERVER_KNOBS->STORAGE_DURABILITY_LAG_SOFT_MAX) {
res = 1.0 - (double(versionLag - SERVER_KNOBS->STORAGE_DURABILITY_LAG_SOFT_MAX) / double(SERVER_KNOBS->STORAGE_DURABILITY_LAG_HARD_MAX-SERVER_KNOBS->STORAGE_DURABILITY_LAG_SOFT_MAX));
} else {
res = 1.0;
}
if (res != old_rate) {
TraceEvent(SevDebug, "LocalRatekeeperChange", thisServerID)
.detail("Old", old_rate)
.detail("New", res)
.detail("NonDurableVersions", versionLag);
old_rate = res;
}
return res;
}
void addMutationToMutationLogOrStorage( Version ver, MutationRef m ); // Appends m to mutationLog@ver, or to storage if ver==invalidVersion
// Update the byteSample, and write the updates to the mutation log@ver, or to storage if ver==invalidVersion
void byteSampleApplyMutation( MutationRef const& m, Version ver );
void byteSampleApplySet( KeyValueRef kv, Version ver );
void byteSampleApplyClear( KeyRangeRef range, Version ver );
// Pops version v from the log system. Does nothing when there is no log system.
// History entries whose version is below v are popped and discarded; the final pop goes
// to the newest remaining history tag, or to this server's own tag when none remain.
// When popAllTags is set (or v exceeds poppedAllAfter, which forces it and resets
// poppedAllAfter to the maximum version), a copy of allHistory is walked instead of
// `history`, so `history` itself is left untouched in that case.
void popVersion(Version v, bool popAllTags = false) {
	if (!logSystem) {
		return;
	}

	if (v > poppedAllAfter) {
		popAllTags = true;
		poppedAllAfter = std::numeric_limits<Version>::max();
	}

	vector<pair<Version,Tag>> allHistoryCopy;
	if (popAllTags) {
		allHistoryCopy = allHistory;
	}
	vector<pair<Version,Tag>>& hist = popAllTags ? allHistoryCopy : history;

	while (!hist.empty() && v > hist.back().first) {
		logSystem->pop( v, hist.back().second );
		hist.pop_back();
	}

	if (!hist.empty()) {
		logSystem->pop( v, hist.back().second );
	} else {
		logSystem->pop( v, tag );
	}
}
// Returns the mutation-log entry for version v, creating it first if necessary.
// A newly created entry shares lastArena (which is replaced by a fresh 4096-byte arena
// once it has grown to 65536 bytes or more) and adds VERSION_OVERHEAD to bytesInput.
Standalone<VersionUpdateRef>& addVersionToMutationLog(Version v) {
	auto existing = mutationLog.find(v);
	if (existing != mutationLog.end()) {
		// The version is already present: hand back the existing entry unchanged.
		return existing->second;
	}

	Standalone<VersionUpdateRef>& created = mutationLog[v];
	created.version = v;
	if (lastArena.getSize() >= 65536) {
		lastArena = Arena(4096);
	}
	created.arena() = lastArena;
	counters.bytesInput += VERSION_OVERHEAD;
	return created;
}
// Applies m to the byte sample at mLV's version, accounts for its MVCC size in
// bytesInput, then deep-copies it into mLV and returns the stored copy.
MutationRef addMutationToMutationLog(Standalone<VersionUpdateRef> &mLV, MutationRef const& m){
	byteSampleApplyMutation(m, mLV.version);
	counters.bytesInput += mvccStorageBytes(m);
	MutationRef stored = mLV.mutations.push_back_deep( mLV.arena(), m );
	return stored;
}
StorageServerDisk storage;
KeyRangeMap< Reference<ShardInfo> > shards;
uint64_t shardChangeCounter; // max( shards->changecounter )
KeyRangeMap <bool> cachedRangeMap; // indicates if a key-range is being cached
// newestAvailableVersion[k]
// == invalidVersion -> k is unavailable at all versions
// <= storageVersion -> k is unavailable at all versions (but might be read anyway from storage if we are in the process of committing makeShardDurable)
// == v -> k is readable (from storage+versionedData) @ [storageVersion,v], and not being updated when version increases
// == latestVersion -> k is readable (from storage+versionedData) @ [storageVersion,version.get()], and thus stays available when version increases
CoalescedKeyRangeMap< Version > newestAvailableVersion;
CoalescedKeyRangeMap< Version > newestDirtyVersion; // Similar to newestAvailableVersion, but includes (only) keys that were only partly available (due to cancelled fetchKeys)
// The following are in rough order from newest to oldest
Version lastTLogVersion, lastVersionWithData, restoredVersion;
NotifiedVersion version;
NotifiedVersion desiredOldestVersion; // We can increase oldestVersion (and then durableVersion) to this version when the disk permits
NotifiedVersion oldestVersion; // See also storageVersion()
NotifiedVersion durableVersion; // At least this version will be readable from storage after a power failure
Version rebootAfterDurableVersion;
int8_t primaryLocality;
Deque<std::pair<Version,Version>> recoveryVersionSkips;
int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this storage server
ProtocolVersion logProtocol;
Reference<ILogSystem> logSystem;
Reference<ILogSystem::IPeekCursor> logCursor;
UID thisServerID;
Key sk;
Reference<AsyncVar<ServerDBInfo>> db;
Database cx;
ActorCollection actors;
StorageServerMetrics metrics;
CoalescedKeyRangeMap<bool, int64_t, KeyBytesMetric<int64_t>> byteSampleClears;
AsyncVar<bool> byteSampleClearsTooLarge;
Future<Void> byteSampleRecovery;
Future<Void> durableInProgress;
AsyncMap<Key,bool> watches;
int64_t watchBytes;
int64_t numWatches;
AsyncVar<bool> noRecentUpdates;
double lastUpdate;
Int64MetricHandle readQueueSizeMetric;
std::string folder;
// defined only during splitMutations()/addMutation()
UpdateEagerReadInfo *updateEagerReads;
FlowLock durableVersionLock;
FlowLock fetchKeysParallelismLock;
vector< Promise<FetchInjectionInfo*> > readyFetchKeys;
int64_t instanceID;
Promise<Void> otherError;
Promise<Void> coreStarted;
bool shuttingDown;
bool behind;
bool versionBehind;
bool debug_inApplyUpdate;
double debug_lastValidateTime;
int maxQueryQueue;
// Returns the current value of maxQueryQueue and resets it to zero.
int getAndResetMaxQueryQueueSize() {
	const int previousMax = maxQueryQueue;
	maxQueryQueue = 0;
	return previousMax;
}
// Tracks, per sampling interval, the cost attributed to each transaction tag and
// remembers which tag was the busiest. startNewInterval() closes the current interval,
// publishes its busiest tag (if its rate is high enough) and resets the counters.
struct TransactionTagCounter {
	// Snapshot of the busiest tag of a finished interval.
	struct TagInfo {
		TransactionTag tag;
		double rate;
		double fractionalBusyness;

		TagInfo(TransactionTag const& tag, double rate, double fractionalBusyness)
		  : tag(tag), rate(rate), fractionalBusyness(fractionalBusyness) {}
	};

	TransactionTagMap<int64_t> intervalCounts;
	int64_t intervalTotalSampledCount = 0;
	TransactionTag busiestTag;
	int64_t busiestTagCount = 0;
	double intervalStart = 0;

	Optional<TagInfo> previousBusiestTag;

	// Cost of an operation that touched `bytes` bytes: one unit per
	// OPERATION_COST_BYTE_FACTOR bytes (integer division), plus one.
	int64_t costFunction(int64_t bytes) {
		return bytes / SERVER_KNOBS->OPERATION_COST_BYTE_FACTOR + 1;
	}

	// Adds the cost of one request to every tag it carries and to the interval total
	// (the total is charged once per request, not once per tag). Requests without tags are ignored.
	void addRequest(Optional<TagSet> const& tags, int64_t bytes) {
		if(tags.present()) {
			TEST(true); // Tracking tag on storage server
			// Keep the cost as an integer: all counters it is added to are int64_t, and
			// routing it through a double would silently lose precision for large values.
			const int64_t cost = costFunction(bytes);
			for(auto& tag : tags.get()) {
				int64_t &count = intervalCounts[TransactionTag(tag, tags.get().arena)];
				count += cost;
				if(count > busiestTagCount) {
					busiestTagCount = count;
					busiestTag = tag;
				}
			}

			intervalTotalSampledCount += cost;
		}
	}

	// Finishes the current interval and starts a new one at now().
	// If an interval was running, sampling is enabled and time has elapsed, the busiest
	// tag is traced, and it is published through previousBusiestTag when its rate exceeds
	// MIN_TAG_PAGES_READ_RATE. previousBusiestTag is cleared in every other case.
	void startNewInterval(UID id) {
		double elapsed = now() - intervalStart;
		previousBusiestTag.reset();
		if (intervalStart > 0 && CLIENT_KNOBS->READ_TAG_SAMPLE_RATE > 0 && elapsed > 0) {
			double rate = busiestTagCount / CLIENT_KNOBS->READ_TAG_SAMPLE_RATE / elapsed;
			if(rate > SERVER_KNOBS->MIN_TAG_PAGES_READ_RATE) {
				previousBusiestTag = TagInfo(busiestTag, rate, (double)busiestTagCount / intervalTotalSampledCount);
			}

			TraceEvent("BusiestReadTag", id)
				.detail("Elapsed", elapsed)
				.detail("Tag", printable(busiestTag))
				.detail("TagCost", busiestTagCount)
				.detail("TotalSampledCost", intervalTotalSampledCount)
				.detail("Reported", previousBusiestTag.present())
				.trackLatest(id.toString() + "/BusiestReadTag");
		}

		intervalCounts.clear();
		intervalTotalSampledCount = 0;
		busiestTagCount = 0;
		intervalStart = now();
	}

	// Busiest tag of the most recently finished interval, if one was published.
	Optional<TagInfo> getBusiestTag() const {
		return previousBusiestTag;
	}
};
TransactionTagCounter transactionTagCounter;
Optional<LatencyBandConfig> latencyBandConfig;
// All counters published by a storage server, grouped in one CounterCollection.
// Besides the plain Counter members, the constructor registers a set of "special"
// counters whose values are computed on demand from the owning StorageServer.
struct Counters {
	CounterCollection cc;
	Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, finishedQueries, rowsQueried, bytesQueried, watchQueries, emptyQueries;

	Counter bytesInput, bytesDurable, bytesFetched,
		mutationBytes;  // Like bytesInput but without MVCC accounting
	Counter sampledBytesCleared;

	Counter mutations, setMutations, clearRangeMutations, atomicMutations;
	Counter updateBatches, updateVersions;
	Counter loops;
	Counter fetchWaitingMS, fetchWaitingCount, fetchExecutingMS, fetchExecutingCount;
	Counter readsRejected;

	LatencyBands readLatencyBands;

	// The initializer list is written in member declaration order, which is the order in
	// which C++ actually constructs the members (allQueries is declared, and therefore
	// constructed, before getKeyQueries). `self` is captured by the special-counter lambdas.
	Counters(StorageServer* self)
	  : cc("StorageServer", self->thisServerID.toString()),
		allQueries("QueryQueue", cc),
		getKeyQueries("GetKeyQueries", cc),
		getValueQueries("GetValueQueries",cc),
		getRangeQueries("GetRangeQueries", cc),
		finishedQueries("FinishedQueries", cc),
		rowsQueried("RowsQueried", cc),
		bytesQueried("BytesQueried", cc),
		watchQueries("WatchQueries", cc),
		emptyQueries("EmptyQueries", cc),
		bytesInput("BytesInput", cc),
		bytesDurable("BytesDurable", cc),
		bytesFetched("BytesFetched", cc),
		mutationBytes("MutationBytes", cc),
		sampledBytesCleared("SampledBytesCleared", cc),
		mutations("Mutations", cc),
		setMutations("SetMutations", cc),
		clearRangeMutations("ClearRangeMutations", cc),
		atomicMutations("AtomicMutations", cc),
		updateBatches("UpdateBatches", cc),
		updateVersions("UpdateVersions", cc),
		loops("Loops", cc),
		fetchWaitingMS("FetchWaitingMS", cc),
		fetchWaitingCount("FetchWaitingCount", cc),
		fetchExecutingMS("FetchExecutingMS", cc),
		fetchExecutingCount("FetchExecutingCount", cc),
		readsRejected("ReadsRejected", cc),
		readLatencyBands("ReadLatencyMetrics", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY)
	{
		// Version bookkeeping.
		specialCounter(cc, "LastTLogVersion", [self](){ return self->lastTLogVersion; });
		specialCounter(cc, "Version", [self](){ return self->version.get(); });
		specialCounter(cc, "StorageVersion", [self](){ return self->storageVersion(); });
		specialCounter(cc, "DurableVersion", [self](){ return self->durableVersion.get(); });
		specialCounter(cc, "DesiredOldestVersion", [self](){ return self->desiredOldestVersion.get(); });
		specialCounter(cc, "VersionLag", [self](){ return self->versionLag; });
		// currentRate() is in [0, 1]; it is published as a percentage.
		specialCounter(cc, "LocalRate", [self]{ return self->currentRate() * 100; });

		specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); });

		specialCounter(cc, "FetchKeysFetchActive", [self](){ return self->fetchKeysParallelismLock.activePermits(); });
		specialCounter(cc, "FetchKeysWaiting", [self](){ return self->fetchKeysParallelismLock.waiters(); });

		// Reading this counter resets the tracked maximum (see getAndResetMaxQueryQueueSize).
		specialCounter(cc, "QueryQueueMax", [self](){ return self->getAndResetMaxQueryQueueSize(); });

		specialCounter(cc, "BytesStored", [self](){ return self->metrics.byteSample.getEstimate(allKeys); });
		specialCounter(cc, "ActiveWatches", [self](){ return self->numWatches; });
		specialCounter(cc, "WatchBytes", [self](){ return self->watchBytes; });

		// Figures reported by the underlying key-value store.
		specialCounter(cc, "KvstoreBytesUsed", [self](){ return self->storage.getStorageBytes().used; });
		specialCounter(cc, "KvstoreBytesFree", [self](){ return self->storage.getStorageBytes().free; });
		specialCounter(cc, "KvstoreBytesAvailable", [self](){ return self->storage.getStorageBytes().available; });
		specialCounter(cc, "KvstoreBytesTotal", [self](){ return self->storage.getStorageBytes().total; });
		specialCounter(cc, "KvstoreSizeTotal", [self]() { return std::get<0>(self->storage.getSize()); });
		specialCounter(cc, "KvstoreNodeTotal", [self]() { return std::get<1>(self->storage.getSize()); });
		specialCounter(cc, "KvstoreInlineKey", [self]() { return std::get<2>(self->storage.getSize()); });
	}
} counters;
// Constructs a storage server on top of the given key-value store.
// NOTE: the initializer list below is not written in member declaration order; C++ always
// initializes members in declaration order regardless of how the list is written.
// The body registers the version metrics, marks the whole key space as unavailable and
// not assigned, and opens the database handle.
StorageServer(IKeyValueStore* storage, Reference<AsyncVar<ServerDBInfo>> const& db, StorageServerInterface const& ssi)
: instanceID(deterministicRandom()->randomUniqueID().first()),
storage(this, storage), db(db), actors(false),
lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0),
rebootAfterDurableVersion(std::numeric_limits<Version>::max()),
durableInProgress(Void()),
versionLag(0), primaryLocality(tagLocalityInvalid),
updateEagerReads(0),
shardChangeCounter(0),
fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_BYTES),
shuttingDown(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), watchBytes(0), numWatches(0),
logProtocol(0), counters(this), tag(invalidTag), maxQueryQueue(0), thisServerID(ssi.id()),
readQueueSizeMetric(LiteralStringRef("StorageServer.ReadQueueSize")),
behind(false), versionBehind(false), byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), noRecentUpdates(false),
lastUpdate(now()), poppedAllAfter(std::numeric_limits<Version>::max()), cpuUsage(0.0), diskUsage(0.0)
{
// Register the notified versions as metrics under this server's counter-collection id.
version.initMetric(LiteralStringRef("StorageServer.Version"), counters.cc.id);
oldestVersion.initMetric(LiteralStringRef("StorageServer.OldestVersion"), counters.cc.id);
durableVersion.initMetric(LiteralStringRef("StorageServer.DurableVersion"), counters.cc.id);
desiredOldestVersion.initMetric(LiteralStringRef("StorageServer.DesiredOldestVersion"), counters.cc.id);
// Initially no key is available or dirty at any version, and the whole key space is one not-assigned shard.
newestAvailableVersion.insert(allKeys, invalidVersion);
newestDirtyVersion.insert(allKeys, invalidVersion);
addShard( ShardInfo::newNotAssigned( allKeys ) );
cx = openDBOnServer(db, TaskPriority::DefaultEndpoint, true, true);
}
//~StorageServer() { fclose(log); }
// Inserts newShard into the shard map under newShard->keys, after stamping it with the
// next value of shardChangeCounter. The map takes (reference-counted) ownership of the
// pointer. The shard's key range must not be empty.
// Shards that previously overlapped newShard->keys are invalidated by this call; the
// caller is responsible for re-adding shards for every range reported by
// shards.getAffectedRangesAfterInsertion(newShard->keys).
void addShard( ShardInfo* newShard ) {
	ASSERT( !newShard->keys.empty() );

	++shardChangeCounter;
	newShard->changeCounter = shardChangeCounter;

	Reference<ShardInfo> owned(newShard);
	shards.insert( newShard->keys, owned );
}
void addMutation(Version version, MutationRef const& mutation, KeyRangeRef const& shard, UpdateEagerReadInfo* eagerReads );
// Sets every version tracked by the server (version, desiredOldestVersion, oldestVersion,
// durableVersion, lastVersionWithData, restoredVersion) to `ver`, then creates version `ver`
// in the versioned data and forgets all versions before it.
void setInitialVersion(Version ver) {
version = ver;
desiredOldestVersion = ver;
oldestVersion = ver;
durableVersion = ver;
lastVersionWithData = ver;
restoredVersion = ver;
mutableData().createNewVersion(ver);
mutableData().forgetVersionsBefore(ver);
}
// This is the maximum version that might be read from storage (the minimum version is durableVersion)
// It is simply the current value of oldestVersion.
Version storageVersion() const { return oldestVersion.get(); }
// True when every shard that intersects `keys` is readable.
bool isReadable( KeyRangeRef const& keys ) {
	auto overlapping = shards.intersectingRanges(keys);
	auto shard = overlapping.begin();
	while (shard != overlapping.end()) {
		if (!shard->value()->isReadable()) {
			return false;
		}
		++shard;
	}
	return true;
}
// Throws wrong_shard_server() if the shard containing `key` has been changed since the
// caller sampled shardChangeCounter as oldShardChangeCounter.
void checkChangeCounter( uint64_t oldShardChangeCounter, KeyRef const& key ) {
	// Nothing in the shard map has changed at all.
	if (oldShardChangeCounter == shardChangeCounter) {
		return;
	}
	// Something changed, but not the shard holding this key.
	if (shards[key]->changeCounter <= oldShardChangeCounter) {
		return;
	}
	TEST(true); // shard change during getValueQ
	throw wrong_shard_server();
}
// Throws wrong_shard_server() if any shard intersecting `keys` has been changed since the
// caller sampled shardChangeCounter as oldShardChangeCounter.
void checkChangeCounter( uint64_t oldShardChangeCounter, KeyRangeRef const& keys ) {
	// Nothing in the shard map has changed at all.
	if (oldShardChangeCounter == shardChangeCounter) {
		return;
	}
	auto overlapping = shards.intersectingRanges(keys);
	for (auto shard = overlapping.begin(); shard != overlapping.end(); ++shard) {
		if (shard->value()->changeCounter <= oldShardChangeCounter) {
			continue;
		}
		TEST(true); // shard change during range operation
		throw wrong_shard_server();
	}
}
// Bytes that have been input but are not yet durable (bytesInput - bytesDurable).
Counter::Value queueSize() {
	const Counter::Value inputBytes = counters.bytesInput.getValue();
	const Counter::Value durableBytes = counters.bytesDurable.getValue();
	return inputBytes - durableBytes;
}
// Penalty reported to clients; never below 1.0. It is the larger of
//   * a queue-based term: how far queueSize() exceeds (TARGET_BYTES - 2 * SPRING_BYTES),
//     measured in units of SPRING_BYTES, and
//   * a rate-based term: 1 / currentRate(), capped at 1e6 when the rate is below 1e-6.
double getPenalty() {
	const double queuePenalty =
	    (queueSize() - (SERVER_KNOBS->TARGET_BYTES_PER_STORAGE_SERVER -
	                    2.0 * SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER)) /
	    SERVER_KNOBS->SPRING_BYTES_STORAGE_SERVER;
	const double ratePenalty = currentRate() < 1e-6 ? 1e6 : 1.0 / currentRate();
	return std::max(std::max(1.0, queuePenalty), ratePenalty);
}
// Trait: true when Reply derives from LoadBalancedReply.
template<class Reply>
using isLoadBalancedReply = std::is_base_of<LoadBalancedReply, Reply>;
// Overload selected for load-balanced reply types: the error and the penalty are carried
// inside a normal reply object (reply.error / reply.penalty) that is sent on the promise.
template <class Reply>
static typename std::enable_if<isLoadBalancedReply<Reply>::value, void>::type sendErrorWithPenalty(
const ReplyPromise<Reply>& promise, const Error& err, double penalty) {
Reply reply;
reply.error = err;
reply.penalty = penalty;
promise.send(reply);
}
// Overload selected for all other reply types: the penalty is ignored and the error is
// sent through the promise's error channel.
template <class Reply>
static typename std::enable_if<!isLoadBalancedReply<Reply>::value, void>::type sendErrorWithPenalty(
const ReplyPromise<Reply>& promise, const Error& err, double) {
promise.sendError(err);
}
// Gate in front of read handlers. When the local rate has dropped below
// STORAGE_DURABILITY_LAG_REJECT_THRESHOLD, a request is rejected with server_overloaded()
// (and the current penalty) with a probability that grows as the rate falls; the accept
// probability never goes below STORAGE_DURABILITY_LAG_MIN_RATE. Otherwise the request is
// handed to fun(this, request).
template<class Request, class HandleFunction>
Future<Void> readGuard(const Request& request, const HandleFunction& fun) {
	const auto rate = currentRate();
	// The random draw only happens when the rate is below the threshold (short-circuit).
	const bool reject =
	    rate < SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD &&
	    deterministicRandom()->random01() > std::max(SERVER_KNOBS->STORAGE_DURABILITY_LAG_MIN_RATE, rate/SERVER_KNOBS->STORAGE_DURABILITY_LAG_REJECT_THRESHOLD);
	if (!reject) {
		return fun(this, request);
	}

	//request.error = future_version();
	sendErrorWithPenalty(request.reply, server_overloaded(), getPenalty());
	++counters.readsRejected;
	return Void();
}
};
// If and only if key:=value is in (storage+versionedData), // NOT ACTUALLY: and key < allKeys.end,
// and H(key) < |key+value|/bytesPerSample,
// let sampledSize = max(|key+value|,bytesPerSample)
// persistByteSampleKeys.begin()+key := sampledSize is in storage
// (key,sampledSize) is in byteSample
// So P(key is sampled) * sampledSize == |key+value|
// Updates the byte sample for mutation m at version ver. Only ClearRange and SetValue
// mutations are accepted; any other mutation type trips the assertion.
void StorageServer::byteSampleApplyMutation( MutationRef const& m, Version ver ){
	switch (m.type) {
	case MutationRef::ClearRange:
		byteSampleApplyClear( KeyRangeRef(m.param1, m.param2), ver );
		break;
	case MutationRef::SetValue:
		byteSampleApplySet( KeyValueRef(m.param1, m.param2), ver );
		break;
	default:
		ASSERT(false); // Mutation of unknown type modifying byte sample
	}
}
#ifndef __INTEL_COMPILER
#pragma endregion
#endif
/////////////////////////////////// Validation ///////////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region Validation
#endif
// Checks the entries of `view` that fall in (or immediately precede) `range` against two invariants:
// * Nonoverlapping: No clear overlaps a set or another clear, or adjoins another clear.
// * Old mutations are erased: every visited item has insertVersion() > minInsertVersion (asserted).
// A violation of the first invariant is traced as SevError "InvalidRange" and makes the
// function return false; otherwise it returns true.
bool validateRange( StorageServer::VersionedData::ViewAtVersion const& view, KeyRangeRef range, Version version, UID id, Version minInsertVersion ) {
	//TraceEvent("ValidateRange", id).detail("KeyBegin", range.begin).detail("KeyEnd", range.end).detail("Version", version);
	KeyRef prevBoundary;       // end key of the previous clear, or key of the previous set
	bool prevWasClear = false; // whether the previous entry was a clear
	bool consistent = true;

	// Start one entry before the range so that a clear reaching into the range is seen too.
	auto i = view.lower_bound(range.begin);
	if (i != view.begin()) --i;

	while (i != view.end() && i.key() < range.end) {
		ASSERT( i.insertVersion() > minInsertVersion );

		// Two consecutive clears may not even touch; otherwise entries only may not overlap.
		const bool clearAfterClear = prevWasClear && i->isClearTo();
		const bool violates = clearAfterClear ? i.key() <= prevBoundary : i.key() < prevBoundary;
		if (violates) {
			TraceEvent(SevError,"InvalidRange",id).detail("Key1", prevBoundary).detail("Key2", i.key()).detail("Version", version);
			consistent = false;
		}
		//ASSERT( i.key() >= k );

		prevWasClear = i->isClearTo();
		prevBoundary = prevWasClear ? i->getEndKey() : i.key();
		++i;
	}
	return consistent;
}
// Runs the consistency checks on a storage server's in-memory state. The checks only run
// when `force` is set or EXPENSIVE_VALIDATION is enabled. Any exception raised while
// validating is traced as SevError "ValidationFailure" and rethrown.
void validate(StorageServer* data, bool force = false) {
	try {
		if (!(force || (EXPENSIVE_VALIDATION))) {
			return;
		}

		data->newestAvailableVersion.validateCoalesced();
		data->newestDirtyVersion.validateCoalesced();

		// Every shard is stored under exactly its own, non-empty key range.
		for(auto s = data->shards.ranges().begin(); s != data->shards.ranges().end(); ++s) {
			ASSERT( s->value()->keys == s->range() );
			ASSERT( !s->value()->keys.empty() );
		}

		// Readable shards are available at latestVersion over their whole range.
		for(auto s = data->shards.ranges().begin(); s != data->shards.ranges().end(); ++s) {
			if (!s->value()->isReadable()) {
				continue;
			}
			auto ar = data->newestAvailableVersion.intersectingRanges(s->range());
			for(auto a = ar.begin(); a != ar.end(); ++a)
				ASSERT( a->value() == latestVersion );
		}

		// * versionedData contains versions [storageVersion(), version.get()].  It might also contain version (version.get()+1), in which changeDurableVersion may be deleting ghosts, and/or it might
		//      contain later versions if applyUpdate is on the stack.
		ASSERT( data->data().getOldestVersion() == data->storageVersion() );
		ASSERT( data->data().getLatestVersion() == data->version.get() || data->data().getLatestVersion() == data->version.get()+1 || (data->debug_inApplyUpdate && data->data().getLatestVersion() > data->version.get()) );

		auto latest = data->data().atLatest();

		// * Old shards are erased: versionedData.atLatest() has entries (sets or clear *begins*) only for keys in readable or adding,transferred shards.
		for(auto s = data->shards.ranges().begin(); s != data->shards.ranges().end(); ++s) {
			ShardInfo* shard = s->value().getPtr();
			if (shard->isInVersionedData()) {
				continue;
			}
			// Trace the offending entry before the assertion below fires.
			auto firstInShard = latest.lower_bound(s->begin());
			if (firstInShard != latest.lower_bound(s->end())) {
				TraceEvent(SevError, "VF", data->thisServerID)
					.detail("LastValidTime", data->debug_lastValidateTime)
					.detail("KeyBegin", s->begin())
					.detail("KeyEnd", s->end())
					.detail("FirstKey", firstInShard.key())
					.detail("FirstInsertV", firstInShard.insertVersion());
			}
			ASSERT( latest.lower_bound(s->begin()) == latest.lower_bound(s->end()) );
		}

		latest.validate();
		validateRange(latest, allKeys, data->version.get(), data->thisServerID, data->durableVersion.get());

		data->debug_lastValidateTime = now();
	} catch (...) {
		TraceEvent(SevError, "ValidationFailure", data->thisServerID).detail("LastValidTime", data->debug_lastValidateTime);
		throw;
	}
}
#ifndef __INTEL_COMPILER
#pragma endregion
#endif
// Refreshes self->cpuUsage and self->diskUsage (both expressed as percentages).
// In simulation fixed values are used; otherwise the values come from getSystemStatistics()
// and are left untouched while the statistics are not yet initialized.
void updateProcessStats(StorageServer* self) {
	if (!g_network->isSimulated()) {
		SystemStatistics sysStats = getSystemStatistics();
		if (sysStats.initialized) {
			self->cpuUsage = 100 * sysStats.processCPUSeconds / sysStats.elapsed;
			self->diskUsage = 100 * std::max(0.0, (sysStats.elapsed - sysStats.processDiskIdleSeconds) / sysStats.elapsed);
		}
		return;
	}

	// diskUsage and cpuUsage are not relevant in the simulator,
	// and relying on the actual values could break seed determinism
	self->cpuUsage = 100.0;
	self->diskUsage = 100.0;
}
///////////////////////////////////// Queries /////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region Queries
#endif
// Waits until data->version reaches `version` and returns it.
// Throws transaction_too_old if, once the version is reached, it is already below oldestVersion,
// and future_version if FUTURE_VERSION_DELAY elapses first.
ACTOR Future<Version> waitForVersionActor(StorageServer* data, Version version, SpanID spanContext) {
	state Span span("SS.WaitForVersion"_loc, { spanContext });
	choose {
		when(wait(data->version.whenAtLeast(version))) {
			// FIXME: A bunch of these can block with or without the following delay 0.
			// wait( delay(0) ); // don't do a whole bunch of these at once
			if (version >= data->oldestVersion.get()) {
				return version;
			}
			throw transaction_too_old(); // just in case
		}
		when(wait(delay(SERVER_KNOBS->FUTURE_VERSION_DELAY))) {
			// Only a small sample of the timeouts is traced.
			const bool traceThisOne = deterministicRandom()->random01() < 0.001;
			if (traceThisOne) {
				TraceEvent(SevWarn, "ShardServerFutureVersion1000x", data->thisServerID)
					.detail("Version", version)
					.detail("MyVersion", data->version.get())
					.detail("ServerID", data->thisServerID);
			}
			throw future_version();
		}
	}
}
// Resolves the version a read should be served at.
//  - latestVersion is mapped to the server's current version (but at least 1).
//  - Versions below oldestVersion, or non-positive ones, yield transaction_too_old.
//  - Versions the server has already reached are returned immediately.
//  - If the server is flagged as behind, a future version yields process_behind.
//  - Otherwise the result comes from waitForVersionActor.
Future<Version> waitForVersion(StorageServer* data, Version version, SpanID spanContext) {
	const Version target = (version == latestVersion) ? std::max(Version(1), data->version.get()) : version;

	if (target <= 0 || target < data->oldestVersion.get()) {
		return transaction_too_old();
	}
	if (target <= data->version.get()) {
		return target;
	}

	// From here on target is strictly ahead of the server's current version.
	if (data->behind || data->versionBehind) {
		return process_behind();
	}

	if (deterministicRandom()->random01() < 0.001) {
		TraceEvent("WaitForVersion1000x");
	}
	return waitForVersionActor(data, target, spanContext);
}
// Like waitForVersion, but never fails with transaction_too_old: it only waits for the server
// to reach `version` (latestVersion maps to the current version, at least 1).
// Throws future_version if FUTURE_VERSION_DELAY elapses before the version is reached.
ACTOR Future<Version> waitForVersionNoTooOld( StorageServer* data, Version version ) {
	// This could become an Actor transparently, but for now it just does the lookup
	if (version == latestVersion) {
		version = std::max(Version(1), data->version.get());
	}
	if (version <= data->version.get()) {
		return version;
	}

	choose {
		when ( wait( data->version.whenAtLeast(version) ) ) {
			return version;
		}
		when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) {
			// Only a small sample of the timeouts is traced.
			const bool traceThisOne = deterministicRandom()->random01() < 0.001;
			if (traceThisOne) {
				TraceEvent(SevWarn, "ShardServerFutureVersion1000x", data->thisServerID)
					.detail("Version", version)
					.detail("MyVersion", data->version.get())
					.detail("ServerID", data->thisServerID);
			}
			throw future_version();
		}
	}
}
// Serves a single-key read (GetValueRequest).
//
// Waits until the server can serve req.version, checks that the shard containing req.key is
// readable, then answers from the in-memory versioned data when it holds a set for the key or a
// clear covering it, and from the storage engine otherwise. Errors accepted by canReplyWith()
// are sent to the client together with the current penalty; any other error propagates.
// Query counters, read sampling and latency bands are updated along the way.
ACTOR Future<Void> getValueQ( StorageServer* data, GetValueRequest req ) {
	state int64_t resultSize = 0;
	// The span is a state variable so that it stays alive across the wait() calls below; a plain
	// local would go out of scope at the first wait and the span would not cover the read.
	state Span span("SS:getValue"_loc, { req.spanContext });

	try {
		++data->counters.getValueQueries;
		++data->counters.allQueries;
		++data->readQueueSizeMetric;
		data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());

		// Active load balancing runs at a very high priority (to obtain accurate queue lengths)
		// so we need to downgrade here
		wait( delay(0, TaskPriority::DefaultEndpoint) );

		if( req.debugID.present() )
			g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask());

		state Optional<Value> v;
		state Version version = wait( waitForVersion( data, req.version, req.spanContext ) );
		if( req.debugID.present() )
			g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask());

		// Remember the shard change counter so a shard change during the disk read can be detected.
		state uint64_t changeCounter = data->shardChangeCounter;

		if (!data->shards[req.key]->isReadable()) {
			//TraceEvent("WrongShardServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ");
			throw wrong_shard_server();
		}

		// path: 1 = value found in versioned data, 2 = read from storage,
		//       0 = key is covered by a clear in versioned data (v stays empty).
		state int path = 0;
		auto i = data->data().at(version).lastLessOrEqual(req.key);
		if (i && i->isValue() && i.key() == req.key) {
			v = (Value)i->getValue();
			path = 1;
		} else if (!i || !i->isClearTo() || i->getEndKey() <= req.key) {
			path = 2;
			Optional<Value> vv = wait( data->storage.readValue( req.key, req.debugID ) );
			// Validate that while we were reading the data we didn't lose the version or shard
			if (version < data->storageVersion()) {
				TEST(true); // transaction_too_old after readValue
				throw transaction_too_old();
			}
			data->checkChangeCounter(changeCounter, req.key);
			v = vv;
		}

		DEBUG_MUTATION("ShardGetValue", version, MutationRef(MutationRef::DebugKey, req.key, v.present()?v.get():LiteralStringRef("<null>")));
		DEBUG_MUTATION("ShardGetPath", version, MutationRef(MutationRef::DebugKey, req.key, path==0?LiteralStringRef("0"):path==1?LiteralStringRef("1"):LiteralStringRef("2")));

		/*
		StorageMetrics m;
		m.bytesPerKSecond = req.key.size() + (v.present() ? v.get().size() : 0);
		m.iosPerKSecond = 1;
		data->metrics.notify(req.key, m);
		*/

		if (v.present()) {
			++data->counters.rowsQueried;
			resultSize = v.get().size();
			data->counters.bytesQueried += resultSize;
		}
		else {
			++data->counters.emptyQueries;
		}

		if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
			// If the read yields no value, randomly sample the empty read.
			int64_t bytesReadPerKSecond =
			    v.present() ? std::max((int64_t)(req.key.size() + v.get().size()), SERVER_KNOBS->EMPTY_READ_PENALTY)
			                : SERVER_KNOBS->EMPTY_READ_PENALTY;
			data->metrics.notifyBytesReadPerKSecond(req.key, bytesReadPerKSecond);
		}

		if( req.debugID.present() )
			g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());

		// Check if the desired key might be cached
		auto cached = data->cachedRangeMap[req.key];
		//if (cached)
		//	TraceEvent(SevDebug, "SSGetValueCached").detail("Key", req.key);

		GetValueReply reply(v, cached);
		reply.penalty = data->getPenalty();
		req.reply.send(reply);
	} catch (Error& e) {
		if(!canReplyWith(e))
			throw;
		data->sendErrorWithPenalty(req.reply, e, data->getPenalty());
	}

	// Bookkeeping that runs for both successful replies and errors that were sent to the client.
	data->transactionTagCounter.addRequest(req.tags, resultSize);

	++data->counters.finishedQueries;
	--data->readQueueSizeMetric;
	if(data->latencyBandConfig.present()) {
		int maxReadBytes = data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
		data->counters.readLatencyBands.addMeasurement(timer() - req.requestTime(), resultSize > maxReadBytes);
	}

	return Void();
}
// Core of a watch on req.key.
// After the server has reached req.version (no too-old check), it repeatedly reads the key at the
// server's current version through getValueQ:
//  - if the value differs from req.value, it replies with the version that was read and finishes;
//  - if data->watchBytes exceeds MAX_STORAGE_SERVER_WATCH_BYTES, it replies watch_cancelled;
//  - otherwise it accounts for the watch and waits for a change notification on the key.
// transaction_too_old from a read is swallowed and the read is retried; other errors accepted by
// canReplyWith() are sent to the client, and anything else propagates to the caller.
ACTOR Future<Void> watchValue_impl( StorageServer* data, WatchValueRequest req, SpanID parent ) {
state Location spanLocation = "SS:WatchValueImpl"_loc;
state Span span(spanLocation, { parent });
try {
++data->counters.watchQueries;
if( req.debugID.present() )
g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.Before"); //.detail("TaskID", g_network->getCurrentTask());
wait(success(waitForVersionNoTooOld(data, req.version)));
if( req.debugID.present() )
g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask());
// minVersion is the versioned data's latest version at the time the watch starts; the change
// notification is registered before the first read.
state Version minVersion = data->data().latestVersion;
state Future<Void> watchFuture = data->watches.onChange(req.key);
loop {
try {
state Version latest = data->version.get();
TEST(latest >= minVersion && latest < data->data().latestVersion); // Starting watch loop with latestVersion > data->version
GetValueRequest getReq( span.context, req.key, latest, req.tags, req.debugID );
state Future<Void> getValue = getValueQ( data, getReq ); //we are relying on the delay zero at the top of getValueQ, if removed we need one here
GetValueReply reply = wait( getReq.reply.getFuture() );
// A fresh span is started for each round of the loop.
span = Span(spanLocation, parent);
//TraceEvent("WatcherCheckValue").detail("Key", req.key ).detail("Value", req.value ).detail("CurrentValue", v ).detail("Ver", latest);
if(reply.error.present()) {
ASSERT(reply.error.get().code() != error_code_future_version);
throw reply.error.get();
}
if(BUGGIFY) {
throw transaction_too_old();
}
DEBUG_MUTATION("ShardWatchValue", latest, MutationRef(MutationRef::DebugKey, req.key, reply.value.present() ? StringRef( reply.value.get() ) : LiteralStringRef("<null>") ) );
if( req.debugID.present() )
g_traceBatch.addEvent("WatchValueDebug", req.debugID.get().first(), "watchValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask());
// The value already differs from what the client expects: the watch fires immediately.
if( reply.value != req.value ) {
req.reply.send(WatchValueReply{ latest });
return Void();
}
if( data->watchBytes > SERVER_KNOBS->MAX_STORAGE_SERVER_WATCH_BYTES ) {
TEST(true); //Too many watches, reverting to polling
data->sendErrorWithPenalty(req.reply, watch_cancelled(), data->getPenalty());
return Void();
}
// Account for this watch while it is waiting; the same amounts are subtracted again on both
// the normal and the error path below.
++data->numWatches;
data->watchBytes += ( req.key.expectedSize() + req.value.expectedSize() + 1000 );
try {
if(latest < minVersion) {
// If the version we read is less than minVersion, then we may fail to be notified of any changes that occur up to or including minVersion
// To prevent that, we'll check the key again once the version reaches our minVersion
watchFuture = watchFuture || data->version.whenAtLeast(minVersion);
}
if(BUGGIFY) {
// Simulate a trigger on the watch that results in the loop going around without the value changing
watchFuture = watchFuture || delay(deterministicRandom()->random01());
}
wait(watchFuture);
--data->numWatches;
data->watchBytes -= ( req.key.expectedSize() + req.value.expectedSize() + 1000 );
} catch( Error &e ) {
--data->numWatches;
data->watchBytes -= ( req.key.expectedSize() + req.value.expectedSize() + 1000 );
throw;
}
} catch( Error &e ) {
if( e.code() != error_code_transaction_too_old ) {
throw;
}
TEST(true); // Reading a watched key failed with transaction_too_old
}
// Re-register for changes, then wait for the server version to catch up with the versioned
// data's latest version before reading again.
watchFuture = data->watches.onChange(req.key);
wait(data->version.whenAtLeast(data->data().latestVersion));
}
} catch (Error& e) {
if(!canReplyWith(e))
throw;
data->sendErrorWithPenalty(req.reply, e, data->getPenalty());
}
return Void();
}
// Runs watchValue_impl for the request and bounds its lifetime with a timeout.
// The timeout is FAST_WATCH_TIMEOUT while data->noRecentUpdates is set, WATCH_TIMEOUT otherwise
// (or none at all when BUGGIFY fires), measured from the start of the watch. It is recomputed
// whenever noRecentUpdates changes. On expiry the client receives timed_out.
ACTOR Future<Void> watchValueQ( StorageServer* data, WatchValueRequest req ) {
	state Span span("SS:watchValue"_loc, { req.spanContext });
	state Future<Void> watch = watchValue_impl( data, req, span.context );
	state double startTime = now();

	loop {
		// A negative value means "no deadline for this round".
		double remaining = -1;
		if (data->noRecentUpdates.get()) {
			remaining = std::max(CLIENT_KNOBS->FAST_WATCH_TIMEOUT - (now() - startTime), 0.0);
		} else if (!BUGGIFY) {
			remaining = std::max(CLIENT_KNOBS->WATCH_TIMEOUT - (now() - startTime), 0.0);
		}

		choose {
			when( wait( watch ) ) {
				return Void();
			}
			when( wait( remaining < 0 ? Never() : delay(remaining) ) ) {
				data->sendErrorWithPenalty(req.reply, timed_out(), data->getPenalty());
				return Void();
			}
			when( wait( data->noRecentUpdates.onChange()) ) {
				// Go around the loop to pick the timeout that matches the new state.
			}
		}
	}
}
// Replies to a GetShardStateRequest once every shard intersecting req.keys has reached the state
// requested by req.mode (READABLE or FETCHING). While that is not the case it waits on the
// relevant per-shard futures (or on SHARD_READY_DELAY when a shard is not assigned) and rechecks.
ACTOR Future<Void> getShardState_impl( StorageServer* data, GetShardStateRequest req ) {
	ASSERT( req.mode != GetShardStateRequest::NO_WAIT );

	loop {
		std::vector<Future<Void>> pending;

		for (auto shardRange : data->shards.intersectingRanges( req.keys )) {
			if (!shardRange.value()->assigned()) {
				// Nothing to subscribe to for an unassigned shard: poll again after a delay.
				pending.push_back( delay( SERVER_KNOBS->SHARD_READY_DELAY ) );
				break;
			}

			if (req.mode == GetShardStateRequest::READABLE) {
				if (!shardRange.value()->isReadable()) {
					pending.push_back( shardRange.value()->adding->readWrite.getFuture() );
				}
			} else if (req.mode == GetShardStateRequest::FETCHING) {
				if (!shardRange.value()->isFetched()) {
					pending.push_back( shardRange.value()->adding->fetchComplete.getFuture() );
				}
			}
		}

		if (pending.empty()) {
			req.reply.send(GetShardStateReply{ data->version.get(), data->durableVersion.get() });
			return Void();
		}

		wait( waitForAll( pending ) );
		wait( delay(0) ); //onChange could have been triggered by cancellation, let things settle before rechecking
	}
}
// Runs getShardState_impl with a deadline (10 seconds in simulation, 60 otherwise).
// If the deadline passes first, the client receives timed_out.
ACTOR Future<Void> getShardStateQ( StorageServer* data, GetShardStateRequest req ) {
	double timeoutSeconds = g_network->isSimulated() ? 10 : 60;

	choose {
		when( wait( getShardState_impl( data, req ) ) ) {
			// The implementation has already sent the reply.
		}
		when( wait( delay( timeoutSeconds ) ) ) {
			data->sendErrorWithPenalty(req.reply, timed_out(), data->getPenalty());
		}
	}
	return Void();
}
void merge( Arena& arena, VectorRef<KeyValueRef, VecSerStrategy::String>& output,
			VectorRef<KeyValueRef> const& vm_output,
			VectorRef<KeyValueRef> const& base,
			int& vCount, int limit, bool stopAtEndOfBase, int& pos, int limitBytes = 1<<30 )
// Merges rows read from disk (`base`, at an older version) with the newer in-memory sets cached in
// `vm_output` (starting at index `pos`, `vCount` of them still unconsumed) and appends up to |limit|
// rows, and roughly up to limitBytes bytes, to `output`.
// When both sources hold the same key, the in-memory row wins and the disk row is skipped.
// If limit<0, base, vm_output and output are in descending key order.
// `pos` and `vCount` are updated to reflect how many in-memory rows were consumed.
// If stopAtEndOfBase is set, in-memory rows beyond the last disk row are left unconsumed.
{
	ASSERT(limit != 0);

	const bool ascending = limit > 0;
	const int rowBudget = ascending ? limit : -limit;
	const int maxOutputSize = rowBudget + output.size();
	int bytesAdded = 0;

	// True while both the row limit and the byte limit still allow another row.
	auto hasRoom = [&]() { return output.size() < maxOutputSize && bytesAdded < limitBytes; };

	// Appends a deep copy of kv to output and charges its size against the byte limit.
	auto emit = [&](KeyValueRef const& kv) {
		output.push_back_deep( arena, kv );
		bytesAdded += sizeof(KeyValueRef) + output.end()[-1].expectedSize();
	};

	KeyValueRef const* diskIt = base.begin();
	KeyValueRef const* const diskEnd = base.end();

	// Phase 1: both sources have rows left; take whichever comes first in iteration order.
	while (diskIt != diskEnd && vCount > 0 && hasRoom()) {
		const bool diskRowFirst = ascending ? diskIt->key < vm_output[pos].key
		                                    : diskIt->key > vm_output[pos].key;
		if (diskRowFirst) {
			emit( *diskIt );
			++diskIt;
		} else {
			emit( vm_output[pos] );
			if (diskIt->key == vm_output[pos].key) {
				++diskIt; // the in-memory row supersedes the disk row with the same key
			}
			++pos;
			--vCount;
		}
	}

	// Phase 2: drain what is left of the disk rows.
	while (diskIt != diskEnd && hasRoom()) {
		emit( *diskIt );
		++diskIt;
	}

	// Phase 3: drain the remaining in-memory rows, unless the caller wants to stop at the last disk row.
	if (stopAtEndOfBase) {
		return;
	}
	while (vCount > 0 && hasRoom()) {
		emit( vm_output[pos] );
		++pos;
		--vCount;
	}
}
// Reads `range` at `version`, combining the in-memory versioned data with what the storage engine
// returns for the gaps between in-memory clears.
// If limit>=0, it returns the first rows in the range (sorted ascending), otherwise the last rows (sorted descending).
// *pLimitBytes is an in/out byte budget: it is decremented by the size of every row added to the result.
// result.more is set when the row limit or the byte budget was exhausted; result.cached reflects
// whether the cachedRangeMap entry containing range.begin is set and extends to at least range.end.
// Throws transaction_too_old if the storage version has advanced past `version` after a disk read.
// readRange has O(|result|) + O(log |data|) cost
ACTOR Future<GetKeyValuesReply> readRange( StorageServer* data, Version version, KeyRange range, int limit, int* pLimitBytes, SpanID parentSpan ) {
state GetKeyValuesReply result;
state StorageServer::VersionedData::ViewAtVersion view = data->data().at(version);
state StorageServer::VersionedData::iterator vCurrent = view.end();
state KeyRef readBegin;
state KeyRef readEnd;
// Owns the key produced by keyAfter() so that readBegin can keep pointing at it across waits
state Key readBeginTemp;
// Number of in-memory rows in resultCache that have not yet been merged into the result
state int vCount = 0;
state Span span("SS:readRange"_loc, parentSpan);
// for caching the storage queue results during the first PTree traversal
state VectorRef<KeyValueRef> resultCache;
// for remembering the position in the resultCache
state int pos = 0;
// Check if the desired key-range is cached
auto containingRange = data->cachedRangeMap.rangeContaining(range.begin);
if (containingRange.value() && containingRange->range().end >= range.end) {
//TraceEvent(SevDebug, "SSReadRangeCached").detail("Size",data->cachedRangeMap.size()).detail("ContainingRangeBegin",containingRange->range().begin).detail("ContainingRangeEnd",containingRange->range().end).
// detail("Begin", range.begin).detail("End",range.end);
result.cached = true;
} else
result.cached = false;
// if (limit >= 0) we are reading forward, else backward
if (limit >= 0) {
// We might care about a clear beginning before start that
// runs into range
vCurrent = view.lastLessOrEqual(range.begin);
if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() > range.begin)
readBegin = vCurrent->getEndKey();
else
readBegin = range.begin;
vCurrent = view.lower_bound(readBegin);
while (limit>0 && *pLimitBytes>0 && readBegin < range.end) {
ASSERT( !vCurrent || vCurrent.key() >= readBegin );
ASSERT( data->storageVersion() <= version );
/* Traverse the PTree further, if there are no unconsumed resultCache items */
if (pos == resultCache.size()) {
if (vCurrent) {
auto b = vCurrent;
--b;
ASSERT(!b || b.key() < readBegin);
}
// Read up to limit items from the view, stopping at the next clear (or the end of the range)
int vSize = 0;
while (vCurrent && vCurrent.key() < range.end && !vCurrent->isClearTo() && vCount < limit &&
vSize < *pLimitBytes) {
// Store the versionedData results in resultCache
resultCache.push_back(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue()));
vSize += sizeof(KeyValueRef) + resultCache.cback().expectedSize();
++vCount;
++vCurrent;
}
}
// Read the data on disk up to vCurrent (or the end of the range)
readEnd = vCurrent ? std::min( vCurrent.key(), range.end ) : range.end;
Standalone<RangeResultRef> atStorageVersion = wait(
data->storage.readRange( KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes ) );
ASSERT( atStorageVersion.size() <= limit );
if (data->storageVersion() > version) throw transaction_too_old();
// merge the sets in resultCache with the sets on disk, stopping at the last key from disk if there is 'more'
int prevSize = result.data.size();
merge( result.arena, result.data, resultCache,
atStorageVersion, vCount, limit, atStorageVersion.more, pos, *pLimitBytes );
// Charge the rows that merge() appended against the row limit and the byte budget
limit -= result.data.size() - prevSize;
for (auto i = result.data.begin() + prevSize; i != result.data.end(); i++) {
*pLimitBytes -= sizeof(KeyValueRef) + i->expectedSize();
}
if (limit <=0 || *pLimitBytes <= 0) {
break;
}
// Setup for the next iteration
// If we hit our limits reading from disk but then combining with MVCC gave us back more room
if (atStorageVersion.more) { // if there might be more data, begin reading right after what we already found to find out
ASSERT(result.data.end()[-1].key == atStorageVersion.end()[-1].key);
readBegin = readBeginTemp = keyAfter( result.data.end()[-1].key );
} else if (vCurrent && vCurrent->isClearTo()){ // if vCurrent is a clear, skip it.
ASSERT(vCurrent->getEndKey() > readBegin);
readBegin = vCurrent->getEndKey(); // next disk read should start at the end of the clear
++vCurrent;
} else {
ASSERT(readEnd == range.end);
break;
}
}
} else {
// Backward read: mirror image of the forward case, walking vCurrent downwards from range.end
vCurrent = view.lastLess(range.end);
// A clear might extend all the way to range.end
if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() >= range.end) {
readEnd = vCurrent.key();
--vCurrent;
} else {
readEnd = range.end;
}
while (limit < 0 && *pLimitBytes > 0 && readEnd > range.begin) {
ASSERT(!vCurrent || vCurrent.key() < readEnd);
ASSERT(data->storageVersion() <= version);
/* Traverse the PTree further, if there are no unconsumed resultCache items */
if (pos == resultCache.size()) {
if (vCurrent) {
auto b = vCurrent;
++b;
ASSERT(!b || b.key() >= readEnd);
}
vCount = 0;
int vSize = 0;
while (vCurrent && vCurrent.key() >= range.begin && !vCurrent->isClearTo() && vCount < -limit &&
vSize < *pLimitBytes) {
// Store the versionedData results in resultCache
resultCache.push_back(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue()));
vSize += sizeof(KeyValueRef) + resultCache.cback().expectedSize();
++vCount;
--vCurrent;
}
}
// Read the data on disk from just past vCurrent (the end of a clear, or the key itself) up to readEnd
readBegin = vCurrent ? std::max(vCurrent->isClearTo() ? vCurrent->getEndKey() : vCurrent.key(), range.begin) : range.begin;
Standalone<RangeResultRef> atStorageVersion =
wait(data->storage.readRange(KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes));
ASSERT(atStorageVersion.size() <= -limit);
if (data->storageVersion() > version) throw transaction_too_old();
int prevSize = result.data.size();
merge( result.arena, result.data, resultCache,
atStorageVersion, vCount, limit, atStorageVersion.more, pos, *pLimitBytes );
// limit is negative here, so adding the number of appended rows moves it towards zero
limit += result.data.size() - prevSize;
for (auto i = result.data.begin() + prevSize; i != result.data.end(); i++) {
*pLimitBytes -= sizeof(KeyValueRef) + i->expectedSize();
}
if (limit >=0 || *pLimitBytes <= 0) {
break;
}
if (atStorageVersion.more) {
ASSERT(result.data.end()[-1].key == atStorageVersion.end()[-1].key);
readEnd = result.data.end()[-1].key;
} else if (vCurrent && vCurrent->isClearTo()) {
ASSERT(vCurrent.key() < readEnd);
readEnd = vCurrent.key();
--vCurrent;
} else {
ASSERT(readBegin == range.begin);
break;
}
}
}
// all but the last item are less than *pLimitBytes
ASSERT(result.data.size() == 0 || *pLimitBytes + result.data.end()[-1].expectedSize() + sizeof(KeyValueRef) > 0);
result.more = limit == 0 || *pLimitBytes<=0; // FIXME: Does this have to be exact?
result.version = version;
return result;
}
//bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) {
// Returns true if the given range suffices to at least begin to resolve the given KeySelectorRef
// return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end);
//}
ACTOR Future<Key> findKey( StorageServer* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset, SpanID parentSpan)
// Attempts to find the key indicated by sel in the data at version, within range.
// Precondition: selectorInRange(sel, range)
// If it is found, offset is set to 0 and a key is returned which falls inside range.
// If the search would depend on any key outside range OR if the key selector offset is too large (range read returns too many bytes), it returns either
// a negative offset and a key in [range.begin, sel.getKey()], indicating the key is (the first key <= returned key) + offset, or
// a positive offset and a key in (sel.getKey(), range.end], indicating the key is (the first key >= returned key) + offset-1
// The range passed in to this function should specify a shard. If range.begin is repeatedly not the beginning of a shard, then it is possible to get stuck looping here
// The resulting offset is written through pOffset; the function itself returns the key.
{
ASSERT( version != latestVersion );
ASSERT( selectorInRange(sel, range) && version >= data->oldestVersion.get() );
// Count forward or backward distance items, skipping the first one if it == key and skipEqualKey
state bool forward = sel.offset > 0; // If forward, result >= sel.getKey(); else result <= sel.getKey()
state int sign = forward ? +1 : -1;
state bool skipEqualKey = sel.orEqual == forward;
state int distance = forward ? sel.offset : 1-sel.offset;
state Span span("SS.findKey"_loc, { parentSpan });
//Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from the read range in this case)
state int maxBytes;
if (sel.offset <= 1 && sel.offset >= 0)
maxBytes = std::numeric_limits<int>::max();
else
maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES;
// Read (distance + skipEqualKey) rows starting at the selector key, in the selector's direction
state GetKeyValuesReply rep = wait(
readRange(data, version,
forward ? KeyRangeRef(sel.getKey(), range.end) : KeyRangeRef(range.begin, keyAfter(sel.getKey())),
(distance + skipEqualKey) * sign, &maxBytes, span.context));
// `more` is only meaningful when the read stopped before returning all the rows that were asked for
state bool more = rep.more && rep.data.size() != distance + skipEqualKey;
//If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in a loop
if(more && !forward && rep.data.size() == 1) {
TEST(true); //Reverse key selector returned only one result in range read
maxBytes = std::numeric_limits<int>::max();
GetKeyValuesReply rep2 =
wait(readRange(data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes, span.context));
rep = rep2;
more = rep.more && rep.data.size() != distance + skipEqualKey;
ASSERT(rep.data.size() == 2 || !more);
}
// Index of the wanted row in rep.data; shifted by one when the first row is the selector key and must be skipped
int index = distance-1;
if (skipEqualKey && rep.data.size() && rep.data[0].key == sel.getKey() )
++index;
if (index < rep.data.size()) {
*pOffset = 0;
if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
int64_t bytesReadPerKSecond =
std::max((int64_t)rep.data[index].key.size(), SERVER_KNOBS->EMPTY_READ_PENALTY);
data->metrics.notifyBytesReadPerKSecond(sel.getKey(), bytesReadPerKSecond);
}
return rep.data[ index ].key;
} else {
if (SERVER_KNOBS->READ_SAMPLING_ENABLED) {
int64_t bytesReadPerKSecond = SERVER_KNOBS->EMPTY_READ_PENALTY;
data->metrics.notifyBytesReadPerKSecond(sel.getKey(), bytesReadPerKSecond);
}
// FIXME: If range.begin=="" && !forward, return success?
// The key was not reached: report how many rows are still missing, signed by direction
*pOffset = index - rep.data.size() + 1;
if (!forward) *pOffset = -*pOffset;
if (more) {
TEST(true); // Key selector read range had more results
ASSERT(rep.data.size());
Key returnKey = forward ? keyAfter(rep.data.back().key) : rep.data.back().key;
//This is possible if key/value pairs are very large and only one result is returned on a last less than query
//SOMEDAY: graceful handling of exceptionally sized values
ASSERT(returnKey != sel.getKey());
return returnKey;
} else
return forward ? range.end : range.begin;
}
}
KeyRange getShardKeyRange( StorageServer* data, const KeySelectorRef& sel )
// Returns largest range such that the shard state isReadable and selectorInRange(sel, range) or wrong_shard_server if no such range exists
{
	// A backward selector resolves towards smaller keys, so it is looked up by the key before sel.getKey().
	auto shardIt = sel.isBackward() ? data->shards.rangeContainingKeyBefore( sel.getKey() )
	                                : data->shards.rangeContaining( sel.getKey() );

	if (shardIt->value()->isReadable()) {
		ASSERT( selectorInRange(sel, shardIt->range()) );
		return shardIt->range();
	}
	throw wrong_shard_server();
}
ACTOR Future<Void> getKeyValuesQ( StorageServer* data, GetKeyValuesRequest req )
// Serves a range read (GetKeyValuesRequest): resolves the begin/end key selectors within the shard
// that contains req.begin, reads the resulting range at the requested version and sends the reply.
// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large selector offset prevents
// all data from being read in one range read
// Errors accepted by canReplyWith() are sent to the client together with the current penalty.
{
	state Span span("SS:getKeyValues"_loc, { req.spanContext });
	state int64_t resultSize = 0;

	++data->counters.getRangeQueries;
	++data->counters.allQueries;
	++data->readQueueSizeMetric;
	data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());

	// Active load balancing runs at a very high priority (to obtain accurate queue lengths)
	// so we need to downgrade here
	if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) {
		wait( delay(0, TaskPriority::FetchKeys) );
	// } else if (false) {
	// 	// Placeholder for up-prioritizing fetches for important requests
	// 	taskType = TaskPriority::DefaultDelay;
	} else {
		wait( delay(0, TaskPriority::DefaultEndpoint) );
	}

	try {
		if( req.debugID.present() )
			g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Before");
		state Version version = wait( waitForVersion( data, req.version, span.context ) );

		// Remember the shard change counter so shard changes during the reads can be detected.
		state uint64_t changeCounter = data->shardChangeCounter;
//		try {
			state KeyRange shard = getShardKeyRange( data, req.begin );

			if( req.debugID.present() )
				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterVersion");
			//.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end);
		//} catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", "None").detail("In", "getKeyValues>getShardKeyRange"); throw e; }

		if ( !selectorInRange(req.end, shard) && !(req.end.isFirstGreaterOrEqual() && req.end.getKey() == shard.end) ) {
//			TraceEvent("WrongShardServer1", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkShardExtents");
			throw wrong_shard_server();
		}

		// findKey() is the only writer of these offsets, and it is skipped for firstGreaterOrEqual
		// selectors. Start from 0 ("key is in this shard") so the offset check below never reads a
		// value that was not explicitly set.
		state int offset1 = 0;
		state int offset2 = 0;
		state Future<Key> fBegin = req.begin.isFirstGreaterOrEqual()
		                               ? Future<Key>(req.begin.getKey())
		                               : findKey(data, req.begin, version, shard, &offset1, span.context);
		state Future<Key> fEnd = req.end.isFirstGreaterOrEqual()
		                             ? Future<Key>(req.end.getKey())
		                             : findKey(data, req.end, version, shard, &offset2, span.context);
		state Key begin = wait(fBegin);
		state Key end = wait(fEnd);

		if( req.debugID.present() )
			g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterKeys");
		//.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey());

		// Offsets of zero indicate begin/end keys in this shard, which obviously means we can answer the query
		// An end offset of 1 is also OK because the end key is exclusive, so if the first key of the next shard is the end the last actual key returned must be from this shard.
		// A begin offset of 1 is also OK because then either begin is past end or equal to end (so the result is definitely empty)
		if ((offset1 && offset1!=1) || (offset2 && offset2!=1)) {
			TEST(true); // wrong_shard_server due to offset
			// We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end, and return a clipped range rather
			// than an error (since that is what the NativeAPI.getRange will do anyway via its "slow path"), but we would have to add some flags to the response
			// to encode whether we went off the beginning and the end, since it needs that information.
			//TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2);
			throw wrong_shard_server();
		}

		if (begin >= end) {
			// Empty range: nothing to read, but the shard must still not have changed underneath us.
			if( req.debugID.present() )
				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.Send");
			//.detail("Begin",begin).detail("End",end);

			GetKeyValuesReply none;
			none.version = version;
			none.more = false;
			none.penalty = data->getPenalty();

			data->checkChangeCounter( changeCounter, KeyRangeRef( std::min<KeyRef>(req.begin.getKey(), req.end.getKey()), std::max<KeyRef>(req.begin.getKey(), req.end.getKey()) ) );
			req.reply.send( none );
		} else {
			state int remainingLimitBytes = req.limitBytes;

			GetKeyValuesReply _r = wait( readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes, span.context) );
			GetKeyValuesReply r = _r;

			if( req.debugID.present() )
				g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storageserver.getKeyValues.AfterReadRange");
			//.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size());
			data->checkChangeCounter( changeCounter, KeyRangeRef( std::min<KeyRef>(begin, std::min<KeyRef>(req.begin.getKey(), req.end.getKey())), std::max<KeyRef>(end, std::max<KeyRef>(req.begin.getKey(), req.end.getKey())) ) );
			if (EXPENSIVE_VALIDATION) {
				for (int i = 0; i < r.data.size(); i++)
					ASSERT(r.data[i].key >= begin && r.data[i].key < end);
				ASSERT(r.data.size() <= std::abs(req.limit));
			}

			/*for( int i = 0; i < r.data.size(); i++ ) {
				StorageMetrics m;
				m.bytesPerKSecond = r.data[i].expectedSize();
				m.iosPerKSecond = 1; //FIXME: this should be 1/r.data.size(), but we cannot do that because it is an int
				data->metrics.notify(r.data[i].key, m);
			}*/

			// For performance concerns, the cost of a range read is billed to the start key and end key of the range.
			int64_t totalByteSize = 0;
			for (int i = 0; i < r.data.size(); i++) {
				totalByteSize += r.data[i].expectedSize();
			}
			if (totalByteSize > 0 && SERVER_KNOBS->READ_SAMPLING_ENABLED) {
				int64_t bytesReadPerKSecond = std::max(totalByteSize, SERVER_KNOBS->EMPTY_READ_PENALTY) / 2;
				data->metrics.notifyBytesReadPerKSecond(r.data[0].key, bytesReadPerKSecond);
				data->metrics.notifyBytesReadPerKSecond(r.data[r.data.size() - 1].key, bytesReadPerKSecond);
			}

			r.penalty = data->getPenalty();
			req.reply.send( r );

			// The bytes consumed from the request's byte budget are what gets accounted as the result size.
			resultSize = req.limitBytes - remainingLimitBytes;
			data->counters.bytesQueried += resultSize;
			data->counters.rowsQueried += r.data.size();
			if(r.data.size() == 0) {
				++data->counters.emptyQueries;
			}
		}
	} catch (Error& e) {
		if(!canReplyWith(e))
			throw;
		data->sendErrorWithPenalty(req.reply, e, data->getPenalty());
	}

	// Bookkeeping that runs for both successful replies and errors that were sent to the client.
	data->transactionTagCounter.addRequest(req.tags, resultSize);
	++data->counters.finishedQueries;
	--data->readQueueSizeMetric;

	if(data->latencyBandConfig.present()) {
		int maxReadBytes = data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
		int maxSelectorOffset = data->latencyBandConfig.get().readConfig.maxKeySelectorOffset.orDefault(std::numeric_limits<int>::max());
		data->counters.readLatencyBands.addMeasurement(
		    timer() - req.requestTime(), resultSize > maxReadBytes || abs(req.begin.offset) > maxSelectorOffset ||
		                                     abs(req.end.offset) > maxSelectorOffset);
	}

	return Void();
}
// Serves a single GetKeyRequest: resolves the key selector req.sel at the requested version and replies with
// a (possibly still unresolved) key selector plus a flag read from data->cachedRangeMap for the found key.
//
// data - the storage server handling the request; its counters, queue metric and tag counter are updated here.
// req  - the request; the reply (or an error) is sent through req.reply.
//
// Errors for which canReplyWith(e) is true are sent back to the requester together with the current penalty;
// any other error is rethrown. The per-request accounting after the try/catch (tag counter, finishedQueries,
// readQueueSizeMetric, latency bands) runs whether the reply was a result or an error.
ACTOR Future<Void> getKeyQ( StorageServer* data, GetKeyRequest req ) {
state Span span("SS:getKey"_loc, { req.spanContext });
// Bytes attributed to this request; stays 0 if the lookup fails before a key is found
state int64_t resultSize = 0;
++data->counters.getKeyQueries;
++data->counters.allQueries;
++data->readQueueSizeMetric;
// Track the high-water mark of (started - finished) queries
data->maxQueryQueue = std::max<int>( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue());
// Active load balancing runs at a very high priority (to obtain accurate queue lengths)
// so we need to downgrade here
wait( delay(0, TaskPriority::DefaultEndpoint) );
try {
state Version version = wait( waitForVersion( data, req.version, req.spanContext ) );
// Snapshot the shard change counter so that checkChangeCounter() below can compare against it after the read
state uint64_t changeCounter = data->shardChangeCounter;
state KeyRange shard = getShardKeyRange( data, req.sel );
// Set by findKey(); 0 means the selector was fully resolved to k
state int offset;
Key k = wait( findKey( data, req.sel, version, shard, &offset, req.spanContext ) );
// Validate the whole span between the selector's base key and the key that was found
data->checkChangeCounter( changeCounter, KeyRangeRef( std::min<KeyRef>(req.sel.getKey(), k), std::max<KeyRef>(req.sel.getKey(), k) ) );
KeySelector updated;
if (offset < 0)
updated = firstGreaterOrEqual(k)+offset; // first thing on this shard OR (large offset case) smallest key retrieved in range read
else if (offset > 0)
updated = firstGreaterOrEqual(k)+offset-1; // first thing on next shard OR (large offset case) keyAfter largest key retrieved in range read
else
updated = KeySelectorRef(k,true,0); //found
resultSize = k.size();
data->counters.bytesQueried += resultSize;
++data->counters.rowsQueried;
// Check if the desired key might be cached
auto cached = data->cachedRangeMap[k];
//if (cached)
// TraceEvent(SevDebug, "SSGetKeyCached").detail("Key", k).detail("Begin", shard.begin.printable()).detail("End", shard.end.printable());
GetKeyReply reply(updated, cached);
reply.penalty = data->getPenalty();
req.reply.send(reply);
}
catch (Error& e) {
//if (e.code() == error_code_wrong_shard_server) TraceEvent("WrongShardServer").detail("In","getKey");
// Only errors accepted by canReplyWith() are forwarded to the requester; everything else propagates
if(!canReplyWith(e))
throw;
data->sendErrorWithPenalty(req.reply, e, data->getPenalty());
}
// SOMEDAY: The size reported here is an undercount of the bytes read due to the fact that we have to scan for the key
// It would be more accurate to count all the read bytes, but it's not critical because this function is only used if
// read-your-writes is disabled
data->transactionTagCounter.addRequest(req.tags, resultSize);
++data->counters.finishedQueries;
--data->readQueueSizeMetric;
if(data->latencyBandConfig.present()) {
// Limits that are not configured default to INT_MAX
int maxReadBytes = data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits<int>::max());
int maxSelectorOffset = data->latencyBandConfig.get().readConfig.maxKeySelectorOffset.orDefault(std::numeric_limits<int>::max());
// The second argument is true when the request exceeded either configured limit
data->counters.readLatencyBands.addMeasurement(
timer() - req.requestTime(), resultSize > maxReadBytes || abs(req.sel.offset) > maxSelectorOffset);
}
return Void();
}
// Replies to a StorageQueuingMetricsRequest with a snapshot of this server's input/durable byte counters,
// storage usage, versions, resource usage and the busiest transaction tag (if any).
void getQueuingMetrics( StorageServer* self, StorageQueuingMetricsRequest const& req ) {
	using TagInfo = StorageServer::TransactionTagCounter::TagInfo;

	StorageQueuingMetricsReply metricsReply;
	metricsReply.localTime = now();
	metricsReply.instanceID = self->instanceID;
	metricsReply.bytesInput = self->counters.bytesInput.getValue();
	metricsReply.bytesDurable = self->counters.bytesDurable.getValue();
	metricsReply.storageBytes = self->storage.getStorageBytes();
	metricsReply.localRateLimit = self->currentRate();
	metricsReply.version = self->version.get();
	metricsReply.cpuUsage = self->cpuUsage;
	metricsReply.diskUsage = self->diskUsage;
	metricsReply.durableVersion = self->durableVersion.get();

	// Busiest-tag fields: the tag itself is optional, the numeric fields fall back to zero when no tag is reported
	Optional<TagInfo> busiest = self->transactionTagCounter.getBusiestTag();
	metricsReply.busiestTag = busiest.map<TransactionTag>([](TagInfo tagInfo) { return tagInfo.tag; });
	if (busiest.present()) {
		metricsReply.busiestTagFractionalBusyness = busiest.get().fractionalBusyness;
		metricsReply.busiestTagRate = busiest.get().rate;
	} else {
		metricsReply.busiestTagFractionalBusyness = 0.0;
		metricsReply.busiestTagRate = 0.0;
	}

	req.reply.send( metricsReply );
}
#ifndef __INTEL_COMPILER
#pragma endregion
#endif
/////////////////////////// Updates ////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region Updates
#endif
// Performs the storage reads collected in `eager`: one readNextKeyInclusive() per entry of eager->keyBegin and
// one readValuePrefix() per entry of eager->keys. All reads are issued before any of them is waited on; the
// results are stored in eager->keyEnd and eager->value respectively.
ACTOR Future<Void> doEagerReads( StorageServer* data, UpdateEagerReadInfo* eager ) {
	eager->finishKeyBegin();

	// Start every "next key" read
	vector<Future<Key>> pendingKeyEnds;
	pendingKeyEnds.reserve( eager->keyBegin.size() );
	for (int k = 0; k < eager->keyBegin.size(); k++)
		pendingKeyEnds.push_back( data->storage.readNextKeyInclusive( eager->keyBegin[k] ) );
	state Future<vector<Key>> allKeyEnds = getAll(pendingKeyEnds);

	// Start every value-prefix read (key, prefix length)
	vector<Future<Optional<Value>>> pendingValues;
	pendingValues.reserve( eager->keys.size() );
	for (int v = 0; v < eager->keys.size(); v++)
		pendingValues.push_back( data->storage.readValuePrefix( eager->keys[v].first, eager->keys[v].second ) );
	state Future<vector<Optional<Value>>> allValues = getAll(pendingValues);

	// Collect the results; the key ends must be `state` because they are held across the second wait
	state vector<Key> resolvedKeyEnds = wait( allKeyEnds );
	vector<Optional<Value>> resolvedValues = wait( allValues );

	eager->keyEnd = resolvedKeyEnds;
	eager->value = resolvedValues;
	return Void();
}
// Advances data->durableVersion towards desiredDurableVersion by at most one mutation log entry per call.
//
// Returns true when the durable version reached desiredDurableVersion, false when only the oldest mutation log
// entry (at a version below desiredDurableVersion) was processed and the caller has to call again.
bool changeDurableVersion( StorageServer* data, Version desiredDurableVersion ) {
// Remove entries from the latest version of data->versionedData that haven't changed since they were inserted
// before or at desiredDurableVersion, to maintain the invariants for versionedData.
// Such entries remain in older versions of versionedData until they are forgotten, because it is expensive to dig them out.
// We also remove everything up to and including nextDurableVersion from mutationLog, and everything
// up to but excluding nextDurableVersion from freeable
// May return false if only part of the work has been done, in which case the caller must call again with the same parameters
auto& verData = data->mutableData();
ASSERT( verData.getLatestVersion() == data->version.get() || verData.getLatestVersion() == data->version.get()+1 );
// Defaults to the requested version; lowered to the oldest log entry's version if that entry is processed below
Version nextDurableVersion = desiredDurableVersion;
auto mlv = data->getMutationLog().begin();
if (mlv != data->getMutationLog().end() && mlv->second.version <= desiredDurableVersion) {
auto& v = mlv->second;
nextDurableVersion = v.version;
// Keep the mutation memory alive through freeable[current version], since the log entry itself is erased below
data->freeable[ data->version.get() ].dependsOn( v.arena() );
// Erasures must happen in a version newer than data->version, so create one if it does not exist yet
if (verData.getLatestVersion() <= data->version.get())
verData.createNewVersion( data->version.get()+1 );
int64_t bytesDurable = VERSION_OVERHEAD;
for(auto m = v.mutations.begin(); m; ++m) {
bytesDurable += mvccStorageBytes(*m);
auto i = verData.atLatest().find(m->param1);
if (i) {
ASSERT( i.key() == m->param1 );
ASSERT( i.insertVersion() >= nextDurableVersion );
// Only drop the entry if it has not been overwritten since this mutation inserted it
if (i.insertVersion() == nextDurableVersion)
verData.erase(i);
}
if (m->type == MutationRef::SetValue) {
// A set can split a clear, so there might be another entry immediately after this one that should also be cleaned up
i = verData.atLatest().upper_bound(m->param1);
if (i) {
ASSERT( i.insertVersion() >= nextDurableVersion );
if (i.insertVersion() == nextDurableVersion)
verData.erase(i);
}
}
}
data->counters.bytesDurable += bytesDurable;
}
if (EXPENSIVE_VALIDATION) {
// Check that the above loop did its job
auto view = data->data().atLatest();
for(auto i = view.begin(); i != view.end(); ++i)
ASSERT( i.insertVersion() > nextDurableVersion );
}
// upper_bound: log entries at versions <= nextDurableVersion are removed
data->getMutableMutationLog().erase(data->getMutationLog().begin(), data->getMutationLog().upper_bound(nextDurableVersion));
// lower_bound: freeable entries at versions < nextDurableVersion are removed
data->freeable.erase( data->freeable.begin(), data->freeable.lower_bound(nextDurableVersion) );
// Grab the error future before setting durableVersion; if it is ready afterwards, get() is called on it
// (presumably rethrowing the stored error -- confirm against Future::get semantics)
Future<Void> checkFatalError = data->otherError.getFuture();
data->durableVersion.set( nextDurableVersion );
setDataDurableVersion(data->thisServerID, data->durableVersion.get());
if (checkFatalError.isReady()) checkFatalError.get();
//TraceEvent("ForgotVersionsBefore", data->thisServerID).detail("Version", nextDurableVersion);
validate(data);
return nextDurableVersion == desiredDurableVersion;
}
// Restricts mutation m to `range`.
// - Single-key mutations are returned unchanged when their key lies in range, otherwise nothing is returned.
// - ClearRange mutations are intersected with range; nothing is returned when the intersection is empty.
// Any other mutation type trips ASSERT(false).
Optional<MutationRef> clipMutation( MutationRef const& m, KeyRangeRef range ) {
	const MutationRef::Type type = (MutationRef::Type) m.type;

	if (isSingleKeyMutation(type)) {
		if (!range.contains(m.param1))
			return Optional<MutationRef>();
		return m;
	}

	if (type != MutationRef::ClearRange) {
		ASSERT(false);
		return Optional<MutationRef>();
	}

	KeyRangeRef overlap = range & KeyRangeRef(m.param1, m.param2);
	if (overlap.empty())
		return Optional<MutationRef>();
	return MutationRef( type, overlap.begin, overlap.end );
}
// Rewrites m in place into the form that is applied to versioned data:
// - ClearRange: the range is widened to engulf overlapping clears already present in the latest version of
//   `data`, and otherwise extended to the next entry known from `data` or from the eager reads.
// - Other non-SetValue mutations: the previous value of m.param1 is looked up (latest version of `data`, else the
//   eager read) and the operation is evaluated, turning m into a SetValue whose param2 is allocated in `ar`.
//   CompareAndClear either becomes a ClearRange of the single key (and is expanded recursively) or is dropped.
//
// eager           - results of the eager reads; supplies getKeyEnd() / getValue() for data not in `data`.
// eagerTrustedEnd - upper bound beyond which eager->getKeyEnd() results are not used.
// Returns false when the mutation turned out to be a no-op (CompareAndClear whose comparison failed), true otherwise.
bool expandMutation( MutationRef& m, StorageServer::VersionedData const& data, UpdateEagerReadInfo* eager, KeyRef eagerTrustedEnd, Arena& ar ) {
// After this function call, m should be copied into an arena immediately (before modifying data, shards, or eager)
if (m.type == MutationRef::ClearRange) {
// Expand the clear
const auto& d = data.atLatest();
// If another clear overlaps the beginning of this one, engulf it
auto i = d.lastLess(m.param1);
if (i && i->isClearTo() && i->getEndKey() >= m.param1)
m.param1 = i.key();
// If another clear overlaps the end of this one, engulf it; otherwise expand
i = d.lastLessOrEqual(m.param2);
if (i && i->isClearTo() && i->getEndKey() >= m.param2) {
m.param2 = i->getEndKey();
} else {
// Expand to the next set or clear (from storage or latestVersion), and if it
// is a clear, engulf it as well
i = d.lower_bound(m.param2);
// The eager read result is only consulted when the clear ends before eagerTrustedEnd, and is capped at it
KeyRef endKeyAtStorageVersion = m.param2 == eagerTrustedEnd ? eagerTrustedEnd : std::min( eager->getKeyEnd( m.param2 ), eagerTrustedEnd );
if (!i || endKeyAtStorageVersion < i.key())
m.param2 = endKeyAtStorageVersion;
else if (i->isClearTo())
m.param2 = i->getEndKey();
else
m.param2 = i.key();
}
}
// NOTE(review): the bare `(m.type)` term only tests that the type is non-zero. It looks like a leftover from a
// predicate call (e.g. an "is atomic op" check) -- confirm. As written, every mutation type that is neither
// ClearRange nor SetValue (nor zero) enters this branch.
else if (m.type != MutationRef::SetValue && (m.type)) {
Optional<StringRef> oldVal;
auto it = data.atLatest().lastLessOrEqual(m.param1);
if (it != data.atLatest().end() && it->isValue() && it.key() == m.param1)
oldVal = it->getValue();
else if (it != data.atLatest().end() && it->isClearTo() && it->getEndKey() > m.param1) {
// The key is covered by a clear in versioned data, so oldVal stays absent
TEST(true); // Atomic op right after a clear.
}
else {
// Nothing known in versioned data: fall back to the value fetched by the eager read
Optional<Value>& oldThing = eager->getValue(m.param1);
if (oldThing.present())
oldVal = oldThing.get();
}
// NOTE(review): there is no default case; a type not listed here falls through to the assignment below and
// becomes a SetValue with its param2 unchanged.
switch(m.type) {
case MutationRef::AddValue:
m.param2 = doLittleEndianAdd(oldVal, m.param2, ar);
break;
case MutationRef::And:
m.param2 = doAnd(oldVal, m.param2, ar);
break;
case MutationRef::Or:
m.param2 = doOr(oldVal, m.param2, ar);
break;
case MutationRef::Xor:
m.param2 = doXor(oldVal, m.param2, ar);
break;
case MutationRef::AppendIfFits:
m.param2 = doAppendIfFits(oldVal, m.param2, ar);
break;
case MutationRef::Max:
m.param2 = doMax(oldVal, m.param2, ar);
break;
case MutationRef::Min:
m.param2 = doMin(oldVal, m.param2, ar);
break;
case MutationRef::ByteMin:
m.param2 = doByteMin(oldVal, m.param2, ar);
break;
case MutationRef::ByteMax:
m.param2 = doByteMax(oldVal, m.param2, ar);
break;
case MutationRef::MinV2:
m.param2 = doMinV2(oldVal, m.param2, ar);
break;
case MutationRef::AndV2:
m.param2 = doAndV2(oldVal, m.param2, ar);
break;
case MutationRef::CompareAndClear:
// A matching comparison turns the mutation into a clear of exactly this key, which is then expanded like
// any other ClearRange; a failed comparison makes the mutation a no-op.
if (oldVal.present() && m.param2 == oldVal.get()) {
m.type = MutationRef::ClearRange;
m.param2 = keyAfter(m.param1, ar);
return expandMutation(m, data, eager, eagerTrustedEnd, ar);
}
return false;
}
m.type = MutationRef::SetValue;
}
return true;
}
// Applies an already-expanded mutation to the latest version of `data` and fires the matching watches.
//
// m     - expected to live in `arena` already; only SetValue and ClearRange have an effect here.
// arena - receives the extra keys created when a set splits an existing clear.
// The mutation is also reported to self->metrics, billed to m.param1.
void applyMutation( StorageServer *self, MutationRef const& m, Arena& arena, StorageServer::VersionedData &data ) {
	StorageMetrics cost;
	cost.bytesPerKSecond = mvccStorageBytes( m ) / 2;
	cost.iosPerKSecond = 1;
	self->metrics.notify(m.param1, cost);

	if (m.type == MutationRef::ClearRange) {
		data.erase( m.param1, m.param2 );
		ASSERT( m.param2 > m.param1 );
		ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) );
		data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) );
		self->watches.triggerRange( m.param1, m.param2 );
		return;
	}

	if (m.type != MutationRef::SetValue)
		return;

	// A set that lands inside an existing clear splits that clear into a left and (possibly) a right half
	auto covering = data.atLatest().lastLessOrEqual(m.param1);
	const bool insideClear = covering && covering->isClearTo() && covering->getEndKey() > m.param1;
	if (insideClear) {
		ASSERT( covering.key() <= m.param1 );
		KeyRef clearEnd = covering->getEndKey();

		// Left half: keeps the insert version of the original clear, because in changeDurableVersion() the
		// original clear is still responsible for removing it. insert() invalidates `covering`, so its key is
		// copied into a fresh KeyRef instead of being passed by reference. If the left half is empty it is
		// overwritten by the insert of the set itself below.
		data.insert( KeyRef(covering.key()), ValueOrClearToRef::clearTo( m.param1 ), covering.insertVersion() );

		KeyRef afterKey = keyAfter(m.param1, arena);
		if ( clearEnd != afterKey ) {
			ASSERT( clearEnd > afterKey );
			// Right half: does not keep the insert version, because in changeDurableVersion() this set is
			// responsible for removing it.
			// FIXME: This copy is technically an asymptotic problem, definitely a waste of memory (copy of keyAfter is a waste, but not asymptotic)
			data.insert( afterKey, ValueOrClearToRef::clearTo( KeyRef(arena, clearEnd) ) );
		}
	}

	data.insert( m.param1, ValueOrClearToRef::value(m.param2) );
	self->watches.trigger( m.param1 );
}
// Removes every entry in `range` from the latest version of the server's versioned data and records a clear
// of `range` in the mutation log entry mLV.
//
// ss     - the storage server whose mutable versioned data is modified.
// mLV    - mutation log entry that receives the clear(s); its arena also receives keys created here.
// shards - shard map used to widen the trimmed range over neighbouring shards that are not in versionedData.
// range  - the key range to remove (taken by value; the local copy may be widened to the right).
void removeDataRange( StorageServer *ss, Standalone<VersionUpdateRef> &mLV, KeyRangeMap<Reference<ShardInfo>>& shards, KeyRangeRef range ) {
// modify the latest version of data to remove all sets and trim all clears to exclude range.
// Add a clear to mLV (mutationLog[data.getLatestVersion()]) that ensures all keys in range are removed from the disk when this latest version becomes durable
// mLV is also modified if necessary to ensure that split clears can be forgotten
MutationRef clearRange( MutationRef::ClearRange, range.begin, range.end );
// The logged clear covers the original range only; the widening below affects versioned data, not this mutation
clearRange = ss->addMutationToMutationLog( mLV, clearRange );
auto& data = ss->mutableData();
// Expand the range to the right to include other shards not in versionedData
for( auto r = shards.rangeContaining(range.end); r != shards.ranges().end() && !r->value()->isInVersionedData(); ++r )
range = KeyRangeRef(range.begin, r->end());
// A clear that starts inside the range but extends past its end must survive as [range.end, clear end)
auto endClear = data.atLatest().lastLess( range.end );
if (endClear && endClear->isClearTo() && endClear->getEndKey() > range.end ) {
// This clear has been bumped up to insertVersion==data.getLatestVersion and needs a corresponding mutation log entry to forget
MutationRef m( MutationRef::ClearRange, range.end, endClear->getEndKey() );
m = ss->addMutationToMutationLog( mLV, m );
data.insert( m.param1, ValueOrClearToRef::clearTo( m.param2 ) );
}
// A clear that starts before the range and reaches into it is truncated to end at range.begin
auto beginClear = data.atLatest().lastLess( range.begin );
if (beginClear && beginClear->isClearTo() && beginClear->getEndKey() > range.begin ) {
// We don't need any special mutationLog entry - because the begin key and insert version are unchanged the original clear
// mutation works to forget this one - but we need range.begin in the right arena
KeyRef rb( mLV.arena(), range.begin );
// insert() invalidates beginClear, so beginClear.key() is not safe to pass to it by reference
data.insert( KeyRef(beginClear.key()), ValueOrClearToRef::clearTo( rb ), beginClear.insertVersion() );
}
// Finally drop everything that is still inside the (possibly widened) range
data.erase( range.begin, range.end );
}
void setAvailableStatus( StorageServer* self, KeyRangeRef keys, bool available );
void setAssignedStatus( StorageServer* self, KeyRangeRef keys, bool nowAssigned );
// Merges adjacent shards around `keys` that are in the same state: consecutive readable shards are replaced by
// one read-write shard, consecutive not-assigned shards by one not-assigned shard. The scan covers the shards
// intersecting `keys` plus one neighbour on each side where one exists.
void coalesceShards(StorageServer *data, KeyRangeRef keys) {
	auto touched = data->shards.intersectingRanges(keys);
	auto everything = data->shards.ranges();

	// Widen the scan by one shard to the left and to the right
	auto cur = touched.begin();
	if( cur != everything.begin() )
		--cur;
	auto stop = touched.end();
	if( stop != everything.end() )
		++stop;

	bool prevReadable = false;
	bool prevNotAssigned = false;
	KeyRangeMap<Reference<ShardInfo>>::iterator prev;

	while( cur != stop ) {
		const bool mergeReadable = prevReadable && cur->value()->isReadable();
		const bool mergeNotAssigned = !mergeReadable && prevNotAssigned && cur->value()->notAssigned();

		if( mergeReadable || mergeNotAssigned ) {
			KeyRange merged = KeyRangeRef( prev->begin(), cur->end() );
			if( mergeReadable )
				data->addShard( ShardInfo::newReadWrite( merged, data) );
			else
				data->addShard( ShardInfo::newNotAssigned( merged) );
			// addShard() changed the map, so continue from the freshly inserted, merged shard
			cur = data->shards.rangeContaining(merged.begin);
		}

		prevReadable = cur->value()->isReadable();
		prevNotAssigned = cur->value()->notAssigned();
		prev = cur;
		++cur;
	}
}
// Reads `keys` at `version` through a client transaction on cx, accumulating results until either the limits
// are reached or the range is exhausted.
//
// limits   - row/byte limits for the whole call; minRows is forced to 0 and the limits are decremented per reply.
// isTooOld - in/out flag. If already set on entry, transaction_too_old is thrown immediately. It is set when a
//            transaction_too_old error interrupts a read that had already advanced past keys.begin.
//
// Returns the accumulated rows. `more` is true when the limits were reached or when the read was cut short by
// transaction_too_old / future_version / process_behind after making progress; in the latter case readThrough
// is set to the resume key when the current begin selector is a firstGreaterOrEqual selector.
// Errors raised before any progress was made, and all other errors, are rethrown.
ACTOR Future<Standalone<RangeResultRef>> tryGetRange( Database cx, Version version, KeyRangeRef keys, GetRangeLimits limits, bool* isTooOld ) {
state Transaction tr( cx );
state Standalone<RangeResultRef> output;
state KeySelectorRef begin = firstGreaterOrEqual( keys.begin );
state KeySelectorRef end = firstGreaterOrEqual( keys.end );
if( *isTooOld )
throw transaction_too_old();
ASSERT(!cx->switchable);
tr.setVersion( version );
tr.info.taskID = TaskPriority::FetchKeys;
limits.minRows = 0;
try {
loop {
Standalone<RangeResultRef> rep = wait( tr.getRange( begin, end, limits, true ) );
limits.decrement( rep );
if( limits.isReached() || !rep.more ) {
// Final reply: either append it to what was collected so far or, if nothing was collected, return it as is
if( output.size() ) {
// Keep rep's memory alive, since the appended KeyValueRefs point into rep's arena
output.arena().dependsOn( rep.arena() );
output.append( output.arena(), rep.begin(), rep.size() );
if( limits.isReached() && rep.readThrough.present() )
output.readThrough = rep.readThrough.get();
} else {
output = rep;
}
output.more = limits.isReached();
return output;
} else if( rep.readThrough.present() ) {
// The reply covers everything up to readThrough (possibly with no rows); resume from there
output.arena().dependsOn( rep.arena() );
if( rep.size() ) {
output.append( output.arena(), rep.begin(), rep.size() );
ASSERT( rep.readThrough.get() > rep.end()[-1].key );
} else {
ASSERT( rep.readThrough.get() > keys.begin );
}
begin = firstGreaterOrEqual( rep.readThrough.get() );
} else {
// More data and no readThrough hint: resume right after the last key collected
output.arena().dependsOn( rep.arena() );
output.append( output.arena(), rep.begin(), rep.size() );
begin = firstGreaterThan( output.end()[-1].key );
}
}
} catch( Error &e ) {
// Once the read has advanced past keys.begin, these errors yield a partial result instead of failing the call
if( begin.getKey() != keys.begin && ( e.code() == error_code_transaction_too_old || e.code() == error_code_future_version || e.code() == error_code_process_behind ) ) {
if( e.code() == error_code_transaction_too_old )
*isTooOld = true;
output.more = true;
if( begin.isFirstGreaterOrEqual() )
output.readThrough = begin.getKey();
return output;
}
throw;
}
}
// Forwards a mutation at `version` to any target that exposes addMutation(Version, MutationRef const&).
template <typename T>
void addMutation( T& target, Version version, MutationRef const& mutation )
{
	target.addMutation( version, mutation );
}
// Overload for reference-counted targets: dereferences the Reference and forwards to the overload taking T&.
template <typename T>
void addMutation( Reference<T>& target, Version version, MutationRef const& mutation )
{
	T& pointee = *target;
	addMutation( pointee, version, mutation );
}
// Routes every mutation of `update`, in order, to the entries of `map` it applies to (see splitMutation),
// using update.version as the version of each mutation.
template <typename T>
void splitMutations(StorageServer* data, KeyRangeMap<T>& map, VerUpdateRef const& update) {
	for (auto const& mutation : update.mutations)
		splitMutation(data, map, mutation, update.version);
}
// Delivers mutation m at version `ver` to the entries of `map` that it touches.
// - Single-key mutations go to the entry containing m.param1.
// - ClearRange mutations are cut at entry boundaries; each intersecting entry receives its own piece.
// When SHORT_CIRCUT_ACTUAL_STORAGE is set, mutations entirely inside normalKeys are skipped.
// Any other mutation type trips ASSERT(false).
template <typename T>
void splitMutation(StorageServer* data, KeyRangeMap<T>& map, MutationRef const& m, Version ver) {
	const MutationRef::Type type = (MutationRef::Type) m.type;

	if (isSingleKeyMutation(type)) {
		if ( SHORT_CIRCUT_ACTUAL_STORAGE && normalKeys.contains(m.param1) )
			return;
		addMutation( map.rangeContaining(m.param1)->value(), ver, m );
		return;
	}

	if (type != MutationRef::ClearRange) {
		ASSERT(false); // Unknown mutation type in splitMutations
		return;
	}

	KeyRangeRef cleared( m.param1, m.param2 );
	if ( SHORT_CIRCUT_ACTUAL_STORAGE && normalKeys.contains(cleared) )
		return;

	auto overlapping = map.intersectingRanges( cleared );
	for (auto entry = overlapping.begin(); entry != overlapping.end(); ++entry) {
		KeyRangeRef piece = cleared & entry->range();
		addMutation( entry->value(), ver, MutationRef(type, piece.begin, piece.end) );
	}
}
// Never-ending actor that periodically logs a "FetchKeysTooLong" trace event for `shard`, reporting how long it
// has been running together with the shard's phase and key range. The period is 600 seconds (5 seconds under
// BUGGIFY); only periods above 300 seconds are logged at SevWarnAlways, shorter ones at SevInfo.
ACTOR Future<Void> logFetchKeysWarning(AddingShard* shard) {
	state double began = now();
	loop {
		state double period = BUGGIFY ? 5.0 : 600.0;
		wait(delay(period));

		auto severity = period > 300.0 ? SevWarnAlways : SevInfo;
		TraceEvent(severity, "FetchKeysTooLong")
			.detail("Duration", now() - began)
			.detail("Phase", shard->phase)
			.detail("Begin", shard->keys.begin.printable())
			.detail("End", shard->keys.end.printable());
	}
}
// Brings the key range of an AddingShard onto this storage server. Visible phases:
//   1. Wait until every version at which keys in the range were previously available or dirty is durable,
//      then acquire fetchKeysParallelismLock for one block's worth of bytes.
//   2. Pick a fetchVersion (under durableVersionLock) and enter the Fetching phase.
//   3. Fetch ONE block of the range with tryGetRange() and write it to storage and the byte sample. If the block
//      did not cover the whole range, the shard is split: this actor keeps [keys.begin, nfk) and a new
//      AddingShard is created for the remainder.
//   4. Wait for the written data to be committed by the regular storage update path, then hand the updates that
//      were deferred in shard->updates to update() at a freshly chosen transferredVersion (Waiting phase).
//   5. Once transferredVersion is durable, mark the range available and replace the shard by a read-write one.
//
// On cancellation (unless the server is shutting down) the partially transferred data is removed again. Any
// other error is forwarded to data->otherError.
ACTOR Future<Void> fetchKeys( StorageServer *data, AddingShard* shard ) {
state TraceInterval interval("FetchKeys");
state KeyRange keys = shard->keys;
state Future<Void> warningLogger = logFetchKeysWarning(shard);
state double startt = now();
state int fetchBlockBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_BLOCK_BYTES : SERVER_KNOBS->FETCH_BLOCK_BYTES;
// delay(0) to force a return to the run loop before the work of fetchKeys is started.
// This allows adding->start() to be called inline with CSK.
wait( data->coreStarted.getFuture() && delay( 0 ) );
try {
DEBUG_KEY_RANGE("fetchKeysBegin", data->version.get(), shard->keys);
TraceEvent(SevDebug, interval.begin(), data->thisServerID)
.detail("KeyBegin", shard->keys.begin)
.detail("KeyEnd",shard->keys.end);
validate(data);
// Wait (if necessary) for the latest version at which any key in keys was previously available (+1) to be durable
auto navr = data->newestAvailableVersion.intersectingRanges( keys );
Version lastAvailable = invalidVersion;
for(auto r=navr.begin(); r!=navr.end(); ++r) {
ASSERT( r->value() != latestVersion );
lastAvailable = std::max(lastAvailable, r->value());
}
// Dirty versions (recorded by cancelled fetches, see the catch block below) are treated the same way
auto ndvr = data->newestDirtyVersion.intersectingRanges( keys );
for(auto r=ndvr.begin(); r!=ndvr.end(); ++r)
lastAvailable = std::max(lastAvailable, r->value());
if (lastAvailable != invalidVersion && lastAvailable >= data->durableVersion.get()) {
TEST(true); // FetchKeys waits for previous available version to be durable
wait( data->durableVersion.whenAtLeast(lastAvailable+1) );
}
TraceEvent(SevDebug, "FetchKeysVersionSatisfied", data->thisServerID).detail("FKID", interval.pairID);
// Reserve a full block's worth of the parallelism budget; the unused part is handed back once the block size is known
wait( data->fetchKeysParallelismLock.take( TaskPriority::DefaultYield, fetchBlockBytes ) );
state FlowLock::Releaser holdingFKPL( data->fetchKeysParallelismLock, fetchBlockBytes );
state double executeStart = now();
++data->counters.fetchWaitingCount;
data->counters.fetchWaitingMS += 1000*(executeStart - startt);
// Fetch keys gets called while the update actor is processing mutations. data->version will not be updated until all mutations for a version
// have been processed. We need to take the durableVersionLock to ensure data->version is greater than the version of the mutation which caused
// the fetch to be initiated.
wait( data->durableVersionLock.take() );
shard->phase = AddingShard::Fetching;
state Version fetchVersion = data->version.get();
data->durableVersionLock.release();
wait(delay(0));
TraceEvent(SevDebug, "FetchKeysUnblocked", data->thisServerID).detail("FKID", interval.pairID).detail("Version", fetchVersion);
// Get the history
state int debug_getRangeRetries = 0;
state int debug_nextRetryToLog = 1;
state bool isTooOld = false;
//FIXME: The client cache does not notice when servers are added to a team. To read from a local storage server we must refresh the cache manually.
data->cx->invalidateCache(keys);
// Retry loop: the body breaks out after the first block that was fetched and written successfully
loop {
try {
TEST(true); // Fetching keys for transferred shard
state Standalone<RangeResultRef> this_block =
wait(tryGetRange(data->cx, fetchVersion, keys,
GetRangeLimits(GetRangeLimits::ROW_LIMIT_UNLIMITED, fetchBlockBytes), &isTooOld));
int expectedSize = (int)this_block.expectedSize() + (8-(int)sizeof(KeyValueRef))*this_block.size();
TraceEvent(SevDebug, "FetchKeysBlock", data->thisServerID).detail("FKID", interval.pairID)
.detail("BlockRows", this_block.size()).detail("BlockBytes", expectedSize)
.detail("KeyBegin", keys.begin).detail("KeyEnd", keys.end)
.detail("Last", this_block.size() ? this_block.end()[-1].key : std::string())
.detail("Version", fetchVersion).detail("More", this_block.more);
DEBUG_KEY_RANGE("fetchRange", fetchVersion, keys);
for(auto k = this_block.begin(); k != this_block.end(); ++k) DEBUG_MUTATION("fetch", fetchVersion, MutationRef(MutationRef::SetValue, k->key, k->value));
data->counters.bytesFetched += expectedSize;
// Give back the part of the reservation that this block did not use
if( fetchBlockBytes > expectedSize ) {
holdingFKPL.release( fetchBlockBytes - expectedSize );
}
// Wait for permission to proceed
//wait( data->fetchKeysStorageWriteLock.take() );
//state FlowLock::Releaser holdingFKSWL( data->fetchKeysStorageWriteLock );
// Write this_block to storage
state KeyValueRef *kvItr = this_block.begin();
for(; kvItr != this_block.end(); ++kvItr) {
data->storage.writeKeyValue( *kvItr );
wait(yield());
}
// Second pass over the same block: update the byte sample
kvItr = this_block.begin();
for(; kvItr != this_block.end(); ++kvItr) {
data->byteSampleApplySet( *kvItr, invalidVersion );
wait(yield());
}
if (this_block.more) {
// nfk: first key that was NOT covered by this block
Key nfk = this_block.readThrough.present() ? this_block.readThrough.get() : keyAfter( this_block.end()[-1].key );
if (nfk != keys.end) {
std::deque< Standalone<VerUpdateRef> > updatesToSplit = std::move( shard->updates );
// This actor finishes committing the keys [keys.begin,nfk) that we already fetched.
// The remaining unfetched keys [nfk,keys.end) will become a separate AddingShard with its own fetchKeys.
shard->server->addShard( ShardInfo::addingSplitLeft( KeyRangeRef(keys.begin, nfk), shard ) );
shard->server->addShard( ShardInfo::newAdding( data, KeyRangeRef(nfk, keys.end) ) );
// Re-read the shard pointer from the map and restart the warning logger with it
shard = data->shards.rangeContaining( keys.begin ).value()->adding;
warningLogger = logFetchKeysWarning(shard);
AddingShard* otherShard = data->shards.rangeContaining( nfk ).value()->adding;
keys = shard->keys;
// Split our prior updates. The ones that apply to our new, restricted key range will go back into shard->updates,
// and the ones delivered to the new shard will be discarded because it is in WaitPrevious phase (hasn't chosen a fetchVersion yet).
// What we are doing here is expensive and could get more expensive if we started having many more blocks per shard. May need optimization in the future.
std::deque< Standalone<VerUpdateRef> >::iterator u = updatesToSplit.begin();
for(; u != updatesToSplit.end(); ++u) {
splitMutations(data, data->shards, *u);
}
TEST( true );
TEST( shard->updates.size() );
ASSERT( otherShard->updates.empty() );
}
}
// Drop the reference to the fetched block before leaving the loop
this_block = Standalone<RangeResultRef>();
if (BUGGIFY) wait( delay( 1 ) );
break;
} catch (Error& e) {
TraceEvent("FKBlockFail", data->thisServerID).error(e,true).suppressFor(1.0).detail("FKID", interval.pairID);
if (e.code() == error_code_transaction_too_old){
TEST(true); // A storage server has forgotten the history data we are fetching
// Retry at the current version instead of the one that has been forgotten
Version lastFV = fetchVersion;
fetchVersion = data->version.get();
isTooOld = false;
// Throw away deferred updates from before fetchVersion, since we don't need them to use blocks fetched at that version
while (!shard->updates.empty() && shard->updates[0].version <= fetchVersion) shard->updates.pop_front();
//FIXME: remove when we no longer support upgrades from 5.X
if(debug_getRangeRetries >= 100) {
data->cx->enableLocalityLoadBalance = false;
}
debug_getRangeRetries++;
// Log with growing gaps between logged retries: the gap doubles until it reaches 1024 attempts
if (debug_nextRetryToLog==debug_getRangeRetries){
debug_nextRetryToLog += std::min(debug_nextRetryToLog, 1024);
TraceEvent(SevWarn, "FetchPast", data->thisServerID).detail("TotalAttempts", debug_getRangeRetries).detail("FKID", interval.pairID).detail("V", lastFV).detail("N", fetchVersion).detail("E", data->version.get());
}
} else if (e.code() == error_code_future_version || e.code() == error_code_process_behind) {
TEST(true); // fetchKeys got future_version or process_behind, so there must be a huge storage lag somewhere. Keep trying.
} else {
throw;
}
wait( delayJittered( FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY ) );
}
}
//FIXME: remove when we no longer support upgrades from 5.X
data->cx->enableLocalityLoadBalance = true;
// We have completed the fetch and write of the data, now we wait for MVCC window to pass.
// As we have finished this work, we will allow more work to start...
shard->fetchComplete.send(Void());
TraceEvent(SevDebug, "FKBeforeFinalCommit", data->thisServerID).detail("FKID", interval.pairID).detail("SV", data->storageVersion()).detail("DV", data->durableVersion.get());
// Directly commit()ing the IKVS would interfere with updateStorage, possibly resulting in an incomplete version being recovered.
// Instead we wait for the updateStorage loop to commit something (and consequently also what we have written)
wait( data->durableVersion.whenAtLeast( data->storageVersion()+1 ) );
holdingFKPL.release();
TraceEvent(SevDebug, "FKAfterFinalCommit", data->thisServerID).detail("FKID", interval.pairID).detail("SV", data->storageVersion()).detail("DV", data->durableVersion.get());
// Wait to run during update(), after a new batch of versions is received from the tlog but before eager reads take place.
Promise<FetchInjectionInfo*> p;
data->readyFetchKeys.push_back( p );
FetchInjectionInfo* batch = wait( p.getFuture() );
TraceEvent(SevDebug, "FKUpdateBatch", data->thisServerID).detail("FKID", interval.pairID);
shard->phase = AddingShard::Waiting;
// Choose a transferredVersion. This choice and timing ensure that
// * The transferredVersion can be mutated in versionedData
// * The transferredVersion isn't yet committed to storage (so we can write the availability status change)
// * The transferredVersion is <= the version of any of the updates in batch, and if there is an equal version
// its mutations haven't been processed yet
shard->transferredVersion = data->version.get() + 1;
//shard->transferredVersion = batch->changes[0].version; //< FIXME: This obeys the documented properties, and seems "safer" because it never introduces extra versions into the data structure, but violates some ASSERTs currently
data->mutableData().createNewVersion( shard->transferredVersion );
ASSERT( shard->transferredVersion > data->storageVersion() );
ASSERT( shard->transferredVersion == data->data().getLatestVersion() );
TraceEvent(SevDebug, "FetchKeysHaveData", data->thisServerID).detail("FKID", interval.pairID)
.detail("Version", shard->transferredVersion).detail("StorageVersion", data->storageVersion());
validate(data);
// Put the updates that were collected during the FinalCommit phase into the batch at the transferredVersion. Eager reads will be done
// for them by update(), and the mutations will come back through AddingShard::addMutations and be applied to versionedMap and mutationLog as normal.
// The lie about their version is acceptable because this shard will never be read at versions < transferredVersion
for(auto i=shard->updates.begin(); i!=shard->updates.end(); ++i) {
i->version = shard->transferredVersion;
batch->arena.dependsOn(i->arena());
}
int startSize = batch->changes.size();
TEST(startSize); //Adding fetch data to a batch which already has changes
batch->changes.resize( batch->changes.size()+shard->updates.size() );
//FIXME: pass the deque back rather than copy the data
std::copy( shard->updates.begin(), shard->updates.end(), batch->changes.begin()+startSize );
// Sanity check: the injected updates are in non-decreasing version order, starting at transferredVersion
Version checkv = shard->transferredVersion;
for(auto b = batch->changes.begin()+startSize; b != batch->changes.end(); ++b ) {
ASSERT( b->version >= checkv );
checkv = b->version;
// NOTE(review): the debug record uses batch->changes[0].version rather than b->version -- confirm this is intended
for(auto& m : b->mutations)
DEBUG_MUTATION("fetchKeysFinalCommitInject", batch->changes[0].version, m);
}
shard->updates.clear();
setAvailableStatus(data, keys, true); // keys will be available when getLatestVersion()==transferredVersion is durable
// Wait for the transferredVersion (and therefore the shard data) to be committed and durable.
wait( data->durableVersion.whenAtLeast( shard->transferredVersion ) );
ASSERT( data->shards[shard->keys.begin]->assigned() && data->shards[shard->keys.begin]->keys == shard->keys ); // We aren't changing whether the shard is assigned
data->newestAvailableVersion.insert(shard->keys, latestVersion);
shard->readWrite.send(Void());
data->addShard( ShardInfo::newReadWrite(shard->keys, data) ); // invalidates shard!
coalesceShards(data, keys);
validate(data);
++data->counters.fetchExecutingCount;
data->counters.fetchExecutingMS += 1000*(now() - executeStart);
TraceEvent(SevDebug, interval.end(), data->thisServerID);
} catch (Error &e){
TraceEvent(SevDebug, interval.end(), data->thisServerID).error(e, true).detail("Version", data->version.get());
// Undo the partial transfer when the fetch is cancelled after it started writing (and the server keeps running)
if (e.code() == error_code_actor_cancelled && !data->shuttingDown && shard->phase >= AddingShard::Fetching) {
if (shard->phase < AddingShard::Waiting) {
// Still in the Fetching phase: clear the range from storage and the byte sample directly
data->storage.clearRange( keys );
data->byteSampleApplyClear( keys, invalidVersion );
} else {
// Waiting phase or later: remove the range through versioned data and the mutation log instead
ASSERT( data->data().getLatestVersion() > data->version.get() );
removeDataRange( data, data->addVersionToMutationLog(data->data().getLatestVersion()), data->shards, keys );
setAvailableStatus(data, keys, false);
// Prevent another, overlapping fetchKeys from entering the Fetching phase until data->data().getLatestVersion() is durable
data->newestDirtyVersion.insert( keys, data->data().getLatestVersion() );
}
}
TraceEvent(SevError, "FetchKeysError", data->thisServerID)
.error(e)
.detail("Elapsed", now()-startt)
.detail("KeyBegin", keys.begin)
.detail("KeyEnd",keys.end);
if (e.code() != error_code_actor_cancelled)
data->otherError.sendError(e); // Kill the storage server. Are there any recoverable errors?
throw; // goes nowhere
}
return Void();
};
// Builds the bookkeeping for a shard that is being added to `server` for the range `keys`.
// The shard starts in the WaitPrevious phase with no transferred version, and the fetchKeys
// actor for it is started from the constructor body (i.e. after all members are initialized).
AddingShard::AddingShard( StorageServer* server, KeyRangeRef const& keys )
	: server(server),
	  keys(keys),
	  transferredVersion(invalidVersion),
	  phase(WaitPrevious)
{
	// fetchKeys receives a pointer to this object, so it is launched only once construction of the members is done
	this->fetchClient = fetchKeys( this->server, this );
}
// Routes a mutation that targets a shard which is still being added.
//   WaitPrevious - the mutation is dropped
//   Fetching     - the mutation is buffered in `updates`, grouped by (non-decreasing) version
//   Waiting      - the mutation is applied to the storage server directly
// Any other phase is an internal error.
void AddingShard::addMutation( Version version, MutationRef const& mutation ){
	// The mutation must fall entirely inside this shard
	if (mutation.type == MutationRef::ClearRange) {
		ASSERT( keys.begin<=mutation.param1 && mutation.param2<=keys.end );
	} else if (isSingleKeyMutation((MutationRef::Type) mutation.type)) {
		ASSERT( keys.contains(mutation.param1) );
	}

	switch (phase) {
	case WaitPrevious:
		// Updates can be discarded
		return;

	case Fetching: {
		const bool startsNewVersion = updates.empty() || version > updates.back().version;
		if (startsNewVersion) {
			VerUpdateRef fresh;
			fresh.version = version;
			fresh.isPrivateData = false;
			updates.push_back(fresh);
		} else {
			// Versions never go backwards, so this must belong to the newest buffered version
			ASSERT( version == updates.end()[-1].version );
		}
		auto& newest = updates.back();
		newest.mutations.push_back_deep( newest.arena(), mutation );
		return;
	}

	case Waiting:
		server->addMutation(version, mutation, keys, server->updateEagerReads);
		return;

	default:
		ASSERT(false);
	}
}
// Delivers a mutation to whichever owner this shard currently has:
// the AddingShard if one exists, otherwise the read-write StorageServer.
// For a shard with neither, ClearRange mutations are silently ignored and anything else is an error.
void ShardInfo::addMutation(Version version, MutationRef const& mutation) {
	ASSERT( (void *)this);
	ASSERT( keys.contains( mutation.param1 ) );

	if (adding) {
		adding->addMutation(version, mutation);
		return;
	}

	if (readWrite) {
		readWrite->addMutation(version, mutation, this->keys, readWrite->updateEagerReads);
		return;
	}

	// Neither adding nor readWrite: only ClearRange is tolerated here
	if (mutation.type == MutationRef::ClearRange) {
		return;
	}

	TraceEvent(SevError, "DeliveredToNotAssigned").detail("Version", version).detail("Mutation", mutation.toString());
	ASSERT(false); // Mutation delivered to notAssigned shard!
}
// Caller context passed to changeServerKeys(). CSK_RESTORE relaxes one of its assertions on unassignment.
enum ChangeServerKeysContext { CSK_UPDATE, CSK_RESTORE };
// Display names for ChangeServerKeysContext, indexed by enumerator value (referenced by the commented-out tracing in changeServerKeys()).
const char* changeServerKeysContextName[] = { "Update", "Restore" };
// Changes whether this storage server is responsible for `keys`, rebuilding the affected entries of data->shards.
//   data        - the storage server whose shard map, newestAvailableVersion map and mutation log are updated
//   keys        - the key range whose assignment is changing; must not be empty
//   nowAssigned - true if the range is being assigned to this server, false if it is being taken away
//   version     - compared against data->newestAvailableVersion to decide whether existing data for a
//                 sub-range is recent enough to be considered available
//   context     - CSK_UPDATE or CSK_RESTORE; CSK_RESTORE relaxes the latest-version assertion on unassignment
// Returns without changing anything if every shard intersecting `keys` already has the requested assigned state.
void changeServerKeys( StorageServer* data, const KeyRangeRef& keys, bool nowAssigned, Version version, ChangeServerKeysContext context ) {
ASSERT( !keys.empty() );
//TraceEvent("ChangeServerKeys", data->thisServerID)
// .detail("KeyBegin", keys.begin)
// .detail("KeyEnd", keys.end)
// .detail("NowAssigned", nowAssigned)
// .detail("Version", version)
// .detail("Context", changeServerKeysContextName[(int)context]);
validate(data);
// TODO(alexmiller): Figure out how to selectively enable spammy data distribution events.
//DEBUG_KEY_RANGE( nowAssigned ? "KeysAssigned" : "KeysUnassigned", version, keys );
// Short circuit: look for at least one intersecting shard whose assigned state differs from the requested one
bool isDifferent = false;
auto existingShards = data->shards.intersectingRanges(keys);
for( auto it = existingShards.begin(); it != existingShards.end(); ++it ) {
if( nowAssigned != it->value()->assigned() ) {
isDifferent = true;
/*TraceEvent("CSKRangeDifferent", data->thisServerID)
.detail("KeyBegin", it->range().begin)
.detail("KeyEnd", it->range().end);*/
break;
}
}
if( !isDifferent ) {
//TraceEvent("CSKShortCircuit", data->thisServerID)
// .detail("KeyBegin", keys.begin)
// .detail("KeyEnd", keys.end);
return;
}
// Save a backup of the ShardInfo references before we start messing with shards, in order to defer fetchKeys cancellation (and
// its potential call to removeDataRange()) until shards is again valid
vector< Reference<ShardInfo> > oldShards;
auto os = data->shards.intersectingRanges(keys);
for(auto r = os.begin(); r != os.end(); ++r)
oldShards.push_back( r->value() );
// As addShard (called below)'s documentation requires, reinitialize any overlapping range(s)
auto ranges = data->shards.getAffectedRangesAfterInsertion( keys, Reference<ShardInfo>() ); // null reference indicates the range being changed
for(int i=0; i<ranges.size(); i++) {
if (!ranges[i].value) {
ASSERT( (KeyRangeRef&)ranges[i] == keys ); // there shouldn't be any nulls except for the range being inserted
} else if (ranges[i].value->notAssigned())
data->addShard( ShardInfo::newNotAssigned(ranges[i]) );
else if (ranges[i].value->isReadable())
data->addShard( ShardInfo::newReadWrite(ranges[i], data) );
else {
ASSERT( ranges[i].value->adding );
data->addShard( ShardInfo::newAdding( data, ranges[i] ) );
TEST( true ); // ChangeServerKeys reFetchKeys
}
}
// Shard state depends on nowAssigned and whether the data is available (actually assigned in memory or on the disk) up to the given
// version. The latter depends on data->newestAvailableVersion, so loop over the ranges of that.
// SOMEDAY: Could this just use shards? Then we could explicitly do the removeDataRange here when an adding/transferred shard is cancelled
auto vr = data->newestAvailableVersion.intersectingRanges(keys);
// Updates to newestAvailableVersion and data removals are collected here and applied after the loop
std::vector<std::pair<KeyRange,Version>> changeNewestAvailable;
std::vector<KeyRange> removeRanges;
for (auto r = vr.begin(); r != vr.end(); ++r) {
// Only the part of this newestAvailableVersion range that lies inside `keys` is affected
KeyRangeRef range = keys & r->range();
bool dataAvailable = r->value()==latestVersion || r->value() >= version;
/*TraceEvent("CSKRange", data->thisServerID)
.detail("KeyBegin", range.begin)
.detail("KeyEnd", range.end)
.detail("Available", dataAvailable)
.detail("NowAssigned", nowAssigned)
.detail("NewestAvailable", r->value())
.detail("ShardState0", data->shards[range.begin]->debugDescribeState());*/
if (!nowAssigned) {
// Range is being taken away: remember to drop its data (if we had any) and mark the shard not assigned
if (dataAvailable) {
ASSERT( r->value() == latestVersion); // Not that we care, but this used to be checked instead of dataAvailable
ASSERT( data->mutableData().getLatestVersion() > version || context == CSK_RESTORE );
changeNewestAvailable.emplace_back(range, version);
removeRanges.push_back( range );
}
data->addShard( ShardInfo::newNotAssigned(range) );
data->watches.triggerRange( range.begin, range.end );
} else if (!dataAvailable) {
// SOMEDAY: Avoid restarting adding/transferred shards
if (version==0){ // bypass fetchkeys; shard is known empty at version 0
changeNewestAvailable.emplace_back(range, latestVersion);
data->addShard( ShardInfo::newReadWrite(range, data) );
setAvailableStatus(data, range, true);
} else {
// Start a new adding shard unless an assigned shard with exactly this range is already in place
auto& shard = data->shards[range.begin];
if( !shard->assigned() || shard->keys != range )
data->addShard( ShardInfo::newAdding(data, range) );
}
} else {
// Range is being assigned and its data is already available: make it readable immediately
changeNewestAvailable.emplace_back(range, latestVersion);
data->addShard( ShardInfo::newReadWrite(range, data) );
}
}
// Update newestAvailableVersion when a shard becomes (un)available (in a separate loop to avoid invalidating vr above)
for(auto r = changeNewestAvailable.begin(); r != changeNewestAvailable.end(); ++r)
data->newestAvailableVersion.insert( r->first, r->second );
if (!nowAssigned)
data->metrics.notifyNotReadable( keys );
coalesceShards( data, KeyRangeRef(ranges[0].begin, ranges[ranges.size()-1].end) );
// Now it is OK to do removeDataRanges, directly and through fetchKeys cancellation (and we have to do so before validate())
oldShards.clear();
ranges.clear();
for(auto r=removeRanges.begin(); r!=removeRanges.end(); ++r) {
removeDataRange( data, data->addVersionToMutationLog(data->data().getLatestVersion()), data->shards, *r );
setAvailableStatus(data, *r, false);
}
validate(data);
}
// Handles a rollback to `rollbackVersion` by unconditionally throwing please_reboot(); it never returns normally.
// `rollbackVersion` is only used for the debug key-range trace; `data` and `nextVersion` are not read here.
void rollback( StorageServer* data, Version rollbackVersion, Version nextVersion ) {
TEST(true); // call to shard rollback
DEBUG_KEY_RANGE("Rollback", rollbackVersion, allKeys);
// We used to do a complicated dance to roll back in MVCC history. It's much simpler, and more testable,
// to simply restart the storage server actor and restore from the persistent disk state, and then roll
// forward from the TLog's history. It's not quite as efficient, but we rarely have to do this in practice.
// FIXME: This code is relying for liveness on an undocumented property of the log system implementation: that after a rollback the rolled back versions will
// eventually be missing from the peeked log. A more sophisticated approach would be to make the rollback range durable and, after reboot, skip over
// those versions if they appear in peek results.
throw please_reboot();
}
// Records `mutation` in the mutation log entry for `version` and applies it to the in-memory versioned data.
//   shard      - the shard the mutation is being applied to; its end key is handed to expandMutation()
//   eagerReads - eager read results handed to expandMutation()
// If expandMutation() reports false, nothing is logged or applied.
void StorageServer::addMutation(Version version, MutationRef const& mutation, KeyRangeRef const& shard, UpdateEagerReadInfo* eagerReads ) {
	MutationRef expanded = mutation;
	auto& versionLog = addVersionToMutationLog(version);

	const bool shouldApply = expandMutation( expanded, data(), eagerReads, shard.end, versionLog.arena() );
	if (shouldApply) {
		// From here on work with the copy returned by the mutation log
		expanded = addMutationToMutationLog(versionLog, expanded);
		DEBUG_MUTATION("applyMutation", version, expanded).detail("UID", thisServerID).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end);
		applyMutation( this, expanded, versionLog.arena(), mutableData() );
		//printf("\nSSUpdate: Printing versioned tree after applying mutation\n");
		//mutableData().printTree(version);
	}
}
// Strict weak ordering for version updates: ascending by version, and within the same
// version an update carrying private data sorts before one that does not.
struct OrderByVersion {
	bool operator()( const VersionUpdateRef& a, const VersionUpdateRef& b ) {
		if (a.version == b.version) {
			// Tie on version: `a` goes first only if it is private data and `b` is not
			return a.isPrivateData && !b.isPrivateData;
		}
		return a.version < b.version;
	}
};
// Prefix shared by all of the storage server's own bookkeeping keys defined below.
#define PERSIST_PREFIX "\xff\xff"
// Immutable
// persistFormat is the key/value pair written by makeNewStorageServerDurable() to mark the on-disk format.
static const KeyValueRef persistFormat( LiteralStringRef( PERSIST_PREFIX "Format" ), LiteralStringRef("FoundationDB/StorageServer/1/4") );
// NOTE(review): presumably the range of format strings this build accepts when reading an existing store - confirm at the point of use
static const KeyRangeRef persistFormatReadableRange( LiteralStringRef("FoundationDB/StorageServer/1/2"), LiteralStringRef("FoundationDB/StorageServer/1/5") );
// Key under which the server's ID is written by makeNewStorageServerDurable()
static const KeyRef persistID = LiteralStringRef( PERSIST_PREFIX "ID" );
// (Potentially) change with the durable version or when fetchKeys completes
static const KeyRef persistVersion = LiteralStringRef( PERSIST_PREFIX "Version" );
// Key ranges holding per-shard "assigned" and "available" markers; the shard's key is appended to the range's begin key
static const KeyRangeRef persistShardAssignedKeys = KeyRangeRef( LiteralStringRef( PERSIST_PREFIX "ShardAssigned/" ), LiteralStringRef( PERSIST_PREFIX "ShardAssigned0" ) );
static const KeyRangeRef persistShardAvailableKeys = KeyRangeRef( LiteralStringRef( PERSIST_PREFIX "ShardAvailable/" ), LiteralStringRef( PERSIST_PREFIX "ShardAvailable0" ) );
// Key range for byte sample entries, and the sub-range that corresponds to keys which themselves start with the byte sample prefix
static const KeyRangeRef persistByteSampleKeys = KeyRangeRef( LiteralStringRef( PERSIST_PREFIX "BS/" ), LiteralStringRef( PERSIST_PREFIX "BS0" ) );
static const KeyRangeRef persistByteSampleSampleKeys = KeyRangeRef( LiteralStringRef( PERSIST_PREFIX "BS/" PERSIST_PREFIX "BS/" ), LiteralStringRef( PERSIST_PREFIX "BS/" PERSIST_PREFIX "BS0" ) );
static const KeyRef persistLogProtocol = LiteralStringRef(PERSIST_PREFIX "LogProtocol");
// Key under which the primary locality is recorded when a primaryLocalityPrivateKey mutation is applied
static const KeyRef persistPrimaryLocality = LiteralStringRef( PERSIST_PREFIX "PrimaryLocality" );
// data keys are unmangled (but never start with PERSIST_PREFIX because they are always in allKeys)
// Applies a stream of mutations, grouped by version, to a StorageServer.
//
// Mutations whose key starts with systemKeys.end are "private" mutations addressed to the storage
// server itself (shard assignment changes, rollbacks, removal, reboot requests, cache ranges, ...);
// all other mutations are ordinary data mutations and are split across the server's shards.
//
// Range-valued private mutations arrive as a pair of SetValue mutations, [begin, end). The updater
// remembers the begin key of such a pair until the matching end key arrives. Shard assignment
// (serverKeys) pairs and storage cache pairs are tracked independently of each other.
class StorageUpdater {
public:
	// Creates an updater that has not seen any version yet.
	StorageUpdater() : currentVersion(invalidVersion), fromVersion(invalidVersion), restoredVersion(invalidVersion), nowAssigned(false), processedStartKey(false), processedCacheStartKey(false) {}

	// fromVersion: the version the server is considered to be at before the first mutation is applied
	// restoredVersion: rollbacks to a version at or below this one are ignored by applyPrivateData()
	StorageUpdater(Version fromVersion, Version restoredVersion) : currentVersion(fromVersion), fromVersion(fromVersion), restoredVersion(restoredVersion), nowAssigned(false), processedStartKey(false), processedCacheStartKey(false) {}

	// Applies one mutation at version `ver`.
	// When `ver` differs from the version of the previously applied mutation, a new version is first
	// created in the server's versioned data. Rethrows any error that has been sent to data->otherError.
	void applyMutation(StorageServer* data, MutationRef const& m, Version ver) {
		//TraceEvent("SSNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver);

		if(currentVersion != ver) {
			fromVersion = currentVersion;
			currentVersion = ver;
			data->mutableData().createNewVersion(ver);
		}

		if (m.param1.startsWith( systemKeys.end )) {
			if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix))
				applyPrivateCacheData( data, m);
			else {
				applyPrivateData( data, m );
			}
		} else {
			// FIXME: enable when DEBUG_MUTATION is active
			//for(auto m = changes[c].mutations.begin(); m; ++m) {
			//	DEBUG_MUTATION("SSUpdateMutation", changes[c].version, *m);
			//}

			splitMutation(data, data->shards, m, ver);
		}

		// Surface any error raised elsewhere in the storage server as soon as possible
		if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
	}

	// Version of the most recently applied mutation (or the constructor's fromVersion if none yet)
	Version currentVersion;
private:
	Version fromVersion;      // version that was current before currentVersion
	Version restoredVersion;
	KeyRef startKey;          // begin key of a pending serverKeys [begin,end) pair
	bool nowAssigned;         // assignment state carried by the pending serverKeys begin key
	bool processedStartKey;   // true while a serverKeys begin key is waiting for its end key
	KeyRef cacheStartKey;     // begin key of a pending storage cache [begin,end) pair
	bool processedCacheStartKey; // true while a storage cache begin key is waiting for its end key

	// Handles every private mutation other than storage cache range changes.
	// Throws worker_removed() when the mutation indicates this server has been removed, and
	// please_reboot() (via rollback()) when a rollback has to be performed.
	void applyPrivateData( StorageServer* data, MutationRef const& m ) {
		TraceEvent(SevDebug, "SSPrivateMutation", data->thisServerID).detail("Mutation", m.toString());

		if (processedStartKey) {
			// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
			// We can also ignore clearRanges, because they are always accompanied by such a pair of sets with the same keys
			ASSERT (m.type == MutationRef::SetValue && m.param1.startsWith(data->sk));
			KeyRangeRef keys( startKey.removePrefix( data->sk ), m.param1.removePrefix( data->sk ));

			// add changes in shard assignment to the mutation log
			setAssignedStatus( data, keys, nowAssigned );

			// The changes for version have already been received (and are being processed now).  We need
			// to fetch the data for change.version-1 (changes from versions < change.version)
			changeServerKeys( data, keys, nowAssigned, currentVersion-1, CSK_UPDATE );
			processedStartKey = false;
		} else if (m.type == MutationRef::SetValue && m.param1.startsWith( data->sk )) {
			// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
			// We can also ignore clearRanges, because they are always accompanied by such a pair of sets with the same keys
			startKey = m.param1;
			nowAssigned = m.param2 != serverKeysFalse;
			processedStartKey = true;
		} else if (m.type == MutationRef::SetValue && m.param1 == lastEpochEndPrivateKey) {
			// lastEpochEnd transactions are guaranteed by the master to be alone in their own batch (version)
			// That means we don't have to worry about the impact on changeServerKeys
			//ASSERT( /*isFirstVersionUpdateFromTLog && */!std::next(it) );

			Version rollbackVersion;
			BinaryReader br(m.param2, Unversioned());
			br >> rollbackVersion;

			if ( rollbackVersion < fromVersion && rollbackVersion > restoredVersion ) {
				TEST( true ); // ShardApplyPrivateData shard rollback
				TraceEvent(SevWarn, "Rollback", data->thisServerID)
					.detail("FromVersion", fromVersion)
					.detail("ToVersion", rollbackVersion)
					.detail("AtVersion", currentVersion)
					.detail("StorageVersion", data->storageVersion());
				ASSERT( rollbackVersion >= data->storageVersion() );
				rollback( data, rollbackVersion, currentVersion );
			}

			data->recoveryVersionSkips.emplace_back(rollbackVersion, currentVersion - rollbackVersion);
		} else if (m.type == MutationRef::SetValue && m.param1 == killStoragePrivateKey) {
			throw worker_removed();
		} else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) && m.param1.substr(1).startsWith(serverTagPrefix)) {
			// A tag set for a different server, or a clear of this server's tag, means this server was removed
			bool matchesThisServer = decodeServerTagKey(m.param1.substr(1)) == data->thisServerID;
			if( (m.type == MutationRef::SetValue && !matchesThisServer) || (m.type == MutationRef::ClearRange && matchesThisServer) )
				throw worker_removed();
		} else if (m.type == MutationRef::SetValue && m.param1 == rebootWhenDurablePrivateKey) {
			data->rebootAfterDurableVersion = currentVersion;
			TraceEvent("RebootWhenDurableSet", data->thisServerID).detail("DurableVersion", data->durableVersion.get()).detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion);
		} else if (m.type == MutationRef::SetValue && m.param1 == primaryLocalityPrivateKey) {
			data->primaryLocality = BinaryReader::fromStringRef<int8_t>(m.param2, Unversioned());
			auto& mLV = data->addVersionToMutationLog( data->data().getLatestVersion() );
			data->addMutationToMutationLog( mLV, MutationRef(MutationRef::SetValue, persistPrimaryLocality, m.param2) );
		} else {
			ASSERT(false);  // Unknown private mutation
		}
	}

	// Handles storage cache range changes, which arrive as [begin,end) pairs of SetValue mutations
	// under storageCachePrefix. The range is recorded in data->cachedRangeMap once the end key arrives.
	void applyPrivateCacheData( StorageServer* data, MutationRef const& m ) {
		//TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString());

		if (processedCacheStartKey) {
			// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
			ASSERT((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix));
			KeyRangeRef keys( cacheStartKey.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ),
							  m.param1.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ));
			data->cachedRangeMap.insert(keys, true);

			//Figure out the affected shard ranges and maintain the cached key-range information in the in-memory map
			// TODO revisit- we are not splitting the cached ranges based on shards as of now.
			if (0) {
				auto cachedRanges = data->shards.intersectingRanges(keys);
				for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) {
					KeyRangeRef intersectingRange = shard.range() & keys;
					TraceEvent(SevDebug, "SSPrivateCacheMutationInsertUnexpected", data->thisServerID).detail("Begin", intersectingRange.begin).detail("End", intersectingRange.end);
					data->cachedRangeMap.insert(intersectingRange, true);
				}
			}
			// The pair is complete: reset the *cache* pairing state so the next cache mutation is
			// treated as a begin key. (The serverKeys pairing flag, processedStartKey, is
			// independent and must not be touched here.)
			processedCacheStartKey = false;
		} else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) {
			// Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end)
			cacheStartKey = m.param1;
			processedCacheStartKey = true;
		} else {
			ASSERT(false);  // Unknown private mutation
		}
	}
};
// Pulls one batch of messages from the storage server's log cursor and applies it to the in-memory state.
//   data            - the storage server being updated
//   pReceivedUpdate - must point to false on entry (asserted); set to true once a batch has been obtained
//                     from the log cursor
// Outline:
//   1. Block while the server is over its hard queue limit or byte sample clears are too large.
//   2. Wait for the log cursor to have more data; throw worker_removed() if the cursor reports popped() > 0.
//   3. Under durableVersionLock, make a first pass over the messages to collect eager reads, let ready
//      fetchKeys actors inject their changes, and perform the eager reads (repeating if shards changed meanwhile).
//   4. Apply the injected changes and then, in a second pass, the messages themselves.
//   5. Advance data->version and data->desiredOldestVersion, then advance the log cursor.
// Errors other than worker_removed and please_reboot are traced at SevError; all errors are rethrown.
ACTOR Future<Void> update( StorageServer* data, bool* pReceivedUpdate )
{
state double start;
try {
// If we are disk bound and durableVersion is very old, we need to block updates or we could run out of memory
// This is often referred to as the storage server e-brake (emergency brake)
state double waitStartT = 0;
while ( data->queueSize() >= SERVER_KNOBS->STORAGE_HARD_LIMIT_BYTES && data->durableVersion.get() < data->desiredOldestVersion.get() ) {
// Emit the lag warning at most once per second while blocked
if (now() - waitStartT >= 1) {
TraceEvent(SevWarn, "StorageServerUpdateLag", data->thisServerID)
.detail("Version", data->version.get())
.detail("DurableVersion", data->durableVersion.get());
waitStartT = now();
}
data->behind = true;
wait( delayJittered(.005, TaskPriority::TLogPeekReply) );
}
while( data->byteSampleClearsTooLarge.get() ) {
wait( data->byteSampleClearsTooLarge.onChange() );
}
state Reference<ILogSystem::IPeekCursor> cursor = data->logCursor;
// Keep asking for more data until the cursor is no longer exhausted
loop {
wait( cursor->getMore() );
if(!cursor->isExhausted()) {
break;
}
}
if(cursor->popped() > 0)
throw worker_removed();
++data->counters.updateBatches;
data->lastTLogVersion = cursor->getMaxKnownVersion();
data->versionLag = std::max<int64_t>(0, data->lastTLogVersion - data->version.get());
ASSERT(*pReceivedUpdate == false);
*pReceivedUpdate = true;
start = now();
wait( data->durableVersionLock.take(TaskPriority::TLogPeekReply,1) );
// The lock is released when holdingDVL goes out of scope
state FlowLock::Releaser holdingDVL( data->durableVersionLock );
if(now() - start > 0.1)
TraceEvent("SSSlowTakeLock1", data->thisServerID).detailf("From", "%016llx", debug_lastLoadBalanceResultEndpointToken).detail("Duration", now() - start).detail("Version", data->version.get());
start = now();
state UpdateEagerReadInfo eager;
state FetchInjectionInfo fii;
state Reference<ILogSystem::IPeekCursor> cloneCursor2;
// First pass: scan the batch (via cloneCursor1) to find out which eager reads are needed, and perform them.
// cloneCursor2 is a second clone of the same cursor position, consumed by the apply pass further below.
loop{
state uint64_t changeCounter = data->shardChangeCounter;
bool epochEnd = false;
bool hasPrivateData = false;
bool firstMutation = true;
bool dbgLastMessageWasProtocol = false;
Reference<ILogSystem::IPeekCursor> cloneCursor1 = cursor->cloneNoMore();
cloneCursor2 = cursor->cloneNoMore();
cloneCursor1->setProtocolVersion(data->logProtocol);
for (; cloneCursor1->hasMessage(); cloneCursor1->nextMessage()) {
ArenaReader& cloneReader = *cloneCursor1->reader();
if (LogProtocolMessage::isNextIn(cloneReader)) {
LogProtocolMessage lpm;
cloneReader >> lpm;
//TraceEvent(SevDebug, "SSReadingLPM", data->thisServerID).detail("Mutation", lpm.toString());
dbgLastMessageWasProtocol = true;
cloneCursor1->setProtocolVersion(cloneReader.protocolVersion());
}
else {
MutationRef msg;
cloneReader >> msg;
//TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg.toString());
// Only the first mutation of the batch is checked for being a private mutation
if (firstMutation && msg.param1.startsWith(systemKeys.end))
hasPrivateData = true;
firstMutation = false;
if (msg.param1 == lastEpochEndPrivateKey) {
epochEnd = true;
ASSERT(dbgLastMessageWasProtocol);
}
eager.addMutation(msg);
dbgLastMessageWasProtocol = false;
}
}
// Any fetchKeys which are ready to transition their shards to the adding,transferred state do so now.
// If there is an epoch end we skip this step, to increase testability and to prevent inserting a version in the middle of a rolled back version range.
while(!hasPrivateData && !epochEnd && !data->readyFetchKeys.empty()) {
auto fk = data->readyFetchKeys.back();
data->readyFetchKeys.pop_back();
fk.send( &fii );
}
for(auto& c : fii.changes)
eager.addMutations(c.mutations);
wait( doEagerReads( data, &eager ) );
if (data->shardChangeCounter == changeCounter) break;
TEST(true); // A fetchKeys completed while we were doing this, so eager might be outdated.  Read it again.
// SOMEDAY: Theoretically we could check the change counters of individual shards and retry the reads only selectively
eager = UpdateEagerReadInfo();
}
if(now() - start > 0.1)
TraceEvent("SSSlowTakeLock2", data->thisServerID).detailf("From", "%016llx", debug_lastLoadBalanceResultEndpointToken).detail("Duration", now() - start).detail("Version", data->version.get());
// Make the eager read results visible to code that applies mutations until they are reset below
data->updateEagerReads = &eager;
data->debug_inApplyUpdate = true;
state StorageUpdater updater(data->lastVersionWithData, data->restoredVersion);
if (EXPENSIVE_VALIDATION) data->data().atLatest().validate();
validate(data);
// Apply the changes injected by fetchKeys first, pausing whenever DESIRED_UPDATE_BYTES have been applied
state bool injectedChanges = false;
state int changeNum = 0;
state int mutationBytes = 0;
for(; changeNum < fii.changes.size(); changeNum++) {
state int mutationNum = 0;
state VerUpdateRef* pUpdate = &fii.changes[changeNum];
for(; mutationNum < pUpdate->mutations.size(); mutationNum++) {
updater.applyMutation(data, pUpdate->mutations[mutationNum], pUpdate->version);
mutationBytes += pUpdate->mutations[mutationNum].totalSize();
injectedChanges = true;
if(mutationBytes > SERVER_KNOBS->DESIRED_UPDATE_BYTES) {
mutationBytes = 0;
wait(delay(SERVER_KNOBS->UPDATE_DELAY));
}
}
}
// Second pass: apply the messages of the batch, tracking the version currently being applied in `ver`
state Version ver = invalidVersion;
cloneCursor2->setProtocolVersion(data->logProtocol);
for (;cloneCursor2->hasMessage(); cloneCursor2->nextMessage()) {
if(mutationBytes > SERVER_KNOBS->DESIRED_UPDATE_BYTES) {
mutationBytes = 0;
//Instead of just yielding, leave time for the storage server to respond to reads
wait(delay(SERVER_KNOBS->UPDATE_DELAY));
}
if (cloneCursor2->version().version > ver) {
ASSERT(cloneCursor2->version().version > data->version.get());
}
auto &rd = *cloneCursor2->reader();
if (cloneCursor2->version().version > ver && cloneCursor2->version().version > data->version.get()) {
++data->counters.updateVersions;
ver = cloneCursor2->version().version;
}
if (LogProtocolMessage::isNextIn(rd)) {
LogProtocolMessage lpm;
rd >> lpm;
data->logProtocol = rd.protocolVersion();
data->storage.changeLogProtocol(ver, data->logProtocol);
cloneCursor2->setProtocolVersion(rd.protocolVersion());
}
else {
MutationRef msg;
rd >> msg;
if (ver != invalidVersion) { // A version has been accepted for this message; the else branch below handles data for which none was
DEBUG_MUTATION("SSPeek", ver, msg).detail("ServerID", data->thisServerID);
if (ver == 1) {
TraceEvent("SSPeekMutation", data->thisServerID);
// The following trace event may produce a value with special characters
//TraceEvent("SSPeekMutation", data->thisServerID).detail("Mutation", msg.toString()).detail("Version", cloneCursor2->version().toString());
}
updater.applyMutation(data, msg, ver);
mutationBytes += msg.totalSize();
data->counters.mutationBytes += msg.totalSize();
++data->counters.mutations;
// Per-type mutation counters; types not listed here are counted only in counters.mutations
switch(msg.type) {
case MutationRef::SetValue:
++data->counters.setMutations;
break;
case MutationRef::ClearRange:
++data->counters.clearRangeMutations;
break;
case MutationRef::AddValue:
case MutationRef::And:
case MutationRef::AndV2:
case MutationRef::AppendIfFits:
case MutationRef::ByteMax:
case MutationRef::ByteMin:
case MutationRef::Max:
case MutationRef::Min:
case MutationRef::MinV2:
case MutationRef::Or:
case MutationRef::Xor:
case MutationRef::CompareAndClear:
++data->counters.atomicMutations;
break;
}
}
else
TraceEvent(SevError, "DiscardingPeekedData", data->thisServerID).detail("Mutation", msg.toString()).detail("Version", cloneCursor2->version().toString());
}
}
if(ver != invalidVersion) {
data->lastVersionWithData = ver;
}
// From here on `ver` is the version the server will advance to: one less than the cursor's current version
ver = cloneCursor2->version().version - 1;
if(injectedChanges) data->lastVersionWithData = ver;
data->updateEagerReads = NULL;
data->debug_inApplyUpdate = false;
if(ver == invalidVersion && !fii.changes.empty() ) {
ver = updater.currentVersion;
}
if(ver != invalidVersion && ver > data->version.get()) {
// TODO(alexmiller): Update to version tracking.
DEBUG_KEY_RANGE("SSUpdate", ver, KeyRangeRef());
data->mutableData().createNewVersion(ver);
if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
data->noRecentUpdates.set(false);
data->lastUpdate = now();
data->version.set( ver ); // Triggers replies to waiting gets for new version(s)
setDataVersion(data->thisServerID, data->version.get());
if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get();
// The in-memory version window is widened by the size of every recorded recovery version skip
Version maxVersionsInMemory = SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS;
for(int i = 0; i < data->recoveryVersionSkips.size(); i++) {
maxVersionsInMemory += data->recoveryVersionSkips[i].second;
}
// Trigger updateStorage if necessary
Version proposedOldestVersion = std::max(data->version.get(), cursor->getMinKnownCommittedVersion()) - maxVersionsInMemory;
if(data->primaryLocality == tagLocalitySpecial || data->tag.locality == data->primaryLocality) {
proposedOldestVersion = std::max(proposedOldestVersion, data->lastTLogVersion - maxVersionsInMemory);
}
// Clamp: strictly below the current version, and never below the current oldest / desired oldest versions
proposedOldestVersion = std::min(proposedOldestVersion, data->version.get()-1);
proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get());
proposedOldestVersion = std::max(proposedOldestVersion, data->desiredOldestVersion.get());
//TraceEvent("StorageServerUpdated", data->thisServerID).detail("Ver", ver).detail("DataVersion", data->version.get())
//	.detail("LastTLogVersion", data->lastTLogVersion).detail("NewOldest", data->oldestVersion.get()).detail("DesiredOldest",data->desiredOldestVersion.get())
//	.detail("MaxVersionInMemory", maxVersionsInMemory).detail("Proposed", proposedOldestVersion).detail("PrimaryLocality", data->primaryLocality).detail("Tag", data->tag.toString());
// Drop recovery version skips that start before the proposed oldest version
while(!data->recoveryVersionSkips.empty() && proposedOldestVersion > data->recoveryVersionSkips.front().first) {
data->recoveryVersionSkips.pop_front();
}
data->desiredOldestVersion.set(proposedOldestVersion);
}
validate(data);
data->logCursor->advanceTo( cloneCursor2->version() );
if(cursor->version().version >= data->lastTLogVersion) {
if(data->behind) {
TraceEvent("StorageServerNoLongerBehind", data->thisServerID).detail("CursorVersion", cursor->version().version).detail("TLogVersion", data->lastTLogVersion);
}
data->behind = false;
}
return Void();  // update will get called again ASAP
} catch (Error& err) {
state Error e = err;
if (e.code() != error_code_worker_removed && e.code() != error_code_please_reboot) {
TraceEvent(SevError, "SSUpdateError", data->thisServerID).error(e).backtrace();
} else if (e.code() == error_code_please_reboot) {
// Wait for data->durableInProgress before propagating the reboot request
wait( data->durableInProgress );
}
throw e;
}
}
// Runs forever, repeatedly making in-memory versions durable.
// Each iteration: waits until data->desiredOldestVersion is ahead of the storage version, writes the
// mutations of the intervening versions to storage (bounded by STORAGE_COMMIT_BYTES), commits, and then
// advances the durable version while holding data->durableVersionLock.
// Throws please_reboot() once a version newer than data->rebootAfterDurableVersion has been committed.
ACTOR Future<Void> updateStorage(StorageServer* data) {
loop {
ASSERT( data->durableVersion.get() == data->storageVersion() );
// In simulation, updateStorage can be disabled for this server until a given time
if (g_network->isSimulated()) {
double endTime = g_simulator.checkDisabled(format("%s/updateStorage", data->thisServerID.toString().c_str()));
if(endTime > now()) {
wait(delay(endTime - now(), TaskPriority::UpdateStorage));
}
}
wait( data->desiredOldestVersion.whenAtLeast( data->storageVersion()+1 ) );
wait( delay(0, TaskPriority::UpdateStorage) );
// data->durableInProgress is fulfilled (or failed) further below, once the commit has completed
state Promise<Void> durableInProgress;
data->durableInProgress = durableInProgress.getFuture();
state Version startOldestVersion = data->storageVersion();
state Version newOldestVersion = data->storageVersion();
state Version desiredVersion = data->desiredOldestVersion.get();
state int64_t bytesLeft = SERVER_KNOBS->STORAGE_COMMIT_BYTES;
// Write mutations to storage until we reach the desiredVersion or have written too much (bytesleft)
loop {
state bool done = data->storage.makeVersionMutationsDurable(newOldestVersion, desiredVersion, bytesLeft);
// We want to forget things from these data structures atomically with changing oldestVersion (and "before", since oldestVersion.set() may trigger waiting actors)
// forgetVersionsBeforeAsync visibly forgets immediately (without waiting) but asynchronously frees memory.
Future<Void> finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( newOldestVersion, TaskPriority::UpdateStorage );
data->oldestVersion.set( newOldestVersion );
wait( finishedForgetting );
wait( yield(TaskPriority::UpdateStorage) );
if (done) break;
}
// Set the new durable version as part of the outstanding change set, before commit
if (startOldestVersion != newOldestVersion)
data->storage.makeVersionDurable( newOldestVersion );
debug_advanceMaxCommittedVersion( data->thisServerID, newOldestVersion );
state Future<Void> durable = data->storage.commit();
// If the byte budget was not used up, also wait STORAGE_COMMIT_INTERVAL (at the end of the iteration) before the next commit
state Future<Void> durableDelay = Void();
if (bytesLeft > 0) {
durableDelay = delay(SERVER_KNOBS->STORAGE_COMMIT_INTERVAL, TaskPriority::UpdateStorage);
}
wait( durable );
debug_advanceMinCommittedVersion( data->thisServerID, newOldestVersion );
if(newOldestVersion > data->rebootAfterDurableVersion) {
TraceEvent("RebootWhenDurableTriggered", data->thisServerID).detail("NewOldestVersion", newOldestVersion).detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion);
// To avoid brokenPromise error, which is caused by the sender of the durableInProgress (i.e., this process)
// never sets durableInProgress, we should set durableInProgress before send the please_reboot() error.
// Otherwise, in the race situation when storage server receives both reboot and
// brokenPromise of durableInProgress, the worker of the storage server will die.
// We will eventually end up with no worker for storage server role.
// The data distributor's buildTeam() will get stuck in building a team
durableInProgress.sendError(please_reboot());
throw please_reboot();
}
durableInProgress.send(Void());
wait( delay(0, TaskPriority::UpdateStorage) ); //Setting durableInProgress could cause the storage server to shut down, so delay to check for cancellation
// Taking and releasing the durableVersionLock ensures that no eager reads both begin before the commit was effective and
// are applied after we change the durable version. Also ensure that we have to lock while calling changeDurableVersion,
// because otherwise the latest version of mutableData might be partially loaded.
wait( data->durableVersionLock.take() );
data->popVersion( data->durableVersion.get() + 1 );
// changeDurableVersion() is retried until it returns true; when a yield is due, the lock is released around the yield
while (!changeDurableVersion( data, newOldestVersion )) {
if(g_network->check_yield(TaskPriority::UpdateStorage)) {
data->durableVersionLock.release();
wait(delay(0, TaskPriority::UpdateStorage));
wait( data->durableVersionLock.take() );
}
}
data->durableVersionLock.release();
//TraceEvent("StorageServerDurable", data->thisServerID).detail("Version", newOldestVersion);
wait( durableDelay );
}
}
#ifndef __INTEL_COMPILER
#pragma endregion
#endif
////////////////////////////////// StorageServerDisk ///////////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region StorageServerDisk
#endif
// Stages the initial persistent records of a new storage server in the underlying store:
// the format marker, this server's ID, its current version, and a "0" entry at the first key of
// both the shard-assigned and the shard-available key spaces. No commit is issued here.
void StorageServerDisk::makeNewStorageServerDurable() {
	// Encode the values up front; the writes below are issued in the same order as before.
	const auto serverIdValue = BinaryWriter::toValue( data->thisServerID, Unversioned() );
	const auto versionValue = BinaryWriter::toValue( data->version.get(), Unversioned() );
	const std::string assignedStart = persistShardAssignedKeys.begin.toString();
	const std::string availableStart = persistShardAvailableKeys.begin.toString();

	storage->set( persistFormat );
	storage->set( KeyValueRef( persistID, serverIdValue ) );
	storage->set( KeyValueRef( persistVersion, versionValue ) );
	storage->set( KeyValueRef( assignedStart, LiteralStringRef("0") ) );
	storage->set( KeyValueRef( availableStart, LiteralStringRef("0") ) );
}
// Appends to the mutation log (at the latest version of self's versioned data) the mutations that
// persist whether `keys` is available. Under the persistShardAvailableKeys prefix it clears the range,
// writes "1"/"0" at the start of the range and, unless keys.end is allKeys.end, writes at the end of
// the range the status of the shard that currently contains keys.end.
// `keys` must not be empty.
void setAvailableStatus( StorageServer* self, KeyRangeRef keys, bool available ) {
	//ASSERT( self->debug_inApplyUpdate );
	ASSERT( !keys.empty() );

	const auto flagValue = []( bool flag ) { return flag ? LiteralStringRef("1") : LiteralStringRef("0"); };
	const std::string prefix = persistShardAvailableKeys.begin.toString();

	auto& logEntry = self->addVersionToMutationLog( self->data().getLatestVersion() );
	KeyRange persistedRange = KeyRangeRef( prefix + keys.begin.toString(), prefix + keys.end.toString() );
	//TraceEvent("SetAvailableStatus", self->thisServerID).detail("Version", logEntry.version).detail("RangeBegin", persistedRange.begin).detail("RangeEnd", persistedRange.end);

	self->addMutationToMutationLog( logEntry, MutationRef( MutationRef::ClearRange, persistedRange.begin, persistedRange.end ) );
	self->addMutationToMutationLog( logEntry, MutationRef( MutationRef::SetValue, persistedRange.begin, flagValue( available ) ) );

	// Nothing follows the end of the key space, so there is no boundary to restore.
	if (keys.end == allKeys.end) return;

	const bool endAvailable = self->shards.rangeContaining( keys.end )->value()->isInVersionedData();
	self->addMutationToMutationLog( logEntry, MutationRef( MutationRef::SetValue, persistedRange.end, flagValue( endAvailable ) ) );
}
// Appends to the mutation log (at the latest version of self's versioned data) the mutations that
// persist whether `keys` is assigned to this server. Under the persistShardAssignedKeys prefix it
// clears the range, writes "1"/"0" at the start of the range and, unless keys.end is allKeys.end,
// writes at the end of the range the assignment status of the shard that currently contains keys.end.
// `keys` must not be empty.
void setAssignedStatus( StorageServer* self, KeyRangeRef keys, bool nowAssigned ) {
	ASSERT( !keys.empty() );

	const auto flagValue = []( bool flag ) { return flag ? LiteralStringRef("1") : LiteralStringRef("0"); };
	const std::string prefix = persistShardAssignedKeys.begin.toString();

	auto& logEntry = self->addVersionToMutationLog( self->data().getLatestVersion() );
	KeyRange persistedRange = KeyRangeRef( prefix + keys.begin.toString(), prefix + keys.end.toString() );
	//TraceEvent("SetAssignedStatus", self->thisServerID).detail("Version", logEntry.version).detail("RangeBegin", persistedRange.begin).detail("RangeEnd", persistedRange.end);

	self->addMutationToMutationLog( logEntry, MutationRef( MutationRef::ClearRange, persistedRange.begin, persistedRange.end ) );
	self->addMutationToMutationLog( logEntry, MutationRef( MutationRef::SetValue, persistedRange.begin, flagValue( nowAssigned ) ) );

	// Nothing follows the end of the key space, so there is no boundary to restore.
	if (keys.end == allKeys.end) return;

	const bool endAssigned = self->shards.rangeContaining( keys.end )->value()->assigned();
	self->addMutationToMutationLog( logEntry, MutationRef( MutationRef::SetValue, persistedRange.end, flagValue( endAssigned ) ) );
}
// Forwards a clear of `keys` to the underlying key-value store.
void StorageServerDisk::clearRange( KeyRangeRef keys ) {
	storage->clear( keys );
}
// Forwards a set of the pair `kv` to the underlying key-value store.
void StorageServerDisk::writeKeyValue( KeyValueRef kv ) {
	storage->set( kv );
}
// Applies one mutation directly to the underlying store.
// Only SetValue and ClearRange are accepted; any other type trips ASSERT(false).
void StorageServerDisk::writeMutation( MutationRef mutation ) {
	// FIXME: DEBUG_MUTATION(debugContext, debugVersion, *m);
	switch (mutation.type) {
	case MutationRef::SetValue:
		storage->set( KeyValueRef( mutation.param1, mutation.param2 ) );
		break;
	case MutationRef::ClearRange:
		storage->clear( KeyRangeRef( mutation.param1, mutation.param2 ) );
		break;
	default:
		ASSERT(false);
	}
}
// Applies every mutation in `mutations`, in order, to the underlying store.
// Each mutation is first passed to DEBUG_MUTATION with `debugContext` and `debugVersion`.
// SetValue and ClearRange are written; mutations of any other type are skipped without error.
void StorageServerDisk::writeMutations( MutationListRef mutations, Version debugVersion, const char* debugContext ) {
	auto cursor = mutations.begin();
	while (cursor) {
		DEBUG_MUTATION(debugContext, debugVersion, *cursor).detail("UID", data->thisServerID);
		switch (cursor->type) {
		case MutationRef::SetValue:
			storage->set( KeyValueRef( cursor->param1, cursor->param2 ) );
			break;
		case MutationRef::ClearRange:
			storage->clear( KeyRangeRef( cursor->param1, cursor->param2 ) );
			break;
		default:
			break;
		}
		++cursor;
	}
}
// Writes to the store the mutations of at most ONE mutation-log entry: the first entry whose version
// is greater than prevStorageVersion, provided that version is <= newStorageVersion.
//   prevStorageVersion  in/out: advanced to the version just written, or to newStorageVersion when no
//                       entry in (prevStorageVersion, newStorageVersion] remains.
//   bytesLeft           in/out: reduced by mvccStorageBytes() of every mutation written.
// Returns true when the caller should stop (byte budget already used up on entry, or nothing left to
// write in the version range), false when an entry was written and there may be more.
bool StorageServerDisk::makeVersionMutationsDurable( Version& prevStorageVersion, Version newStorageVersion, int64_t& bytesLeft ) {
	if (bytesLeft <= 0) return true;

	// Apply mutations from the mutationLog
	const auto& mutationLog = data->getMutationLog();
	auto next = mutationLog.upper_bound( prevStorageVersion );
	if (next == mutationLog.end() || next->first > newStorageVersion) {
		prevStorageVersion = newStorageVersion;
		return true;
	}

	VersionUpdateRef const& update = next->second;
	ASSERT( update.version > prevStorageVersion && update.version <= newStorageVersion );
	// TODO(alexmiller): Update to version tracking.
	DEBUG_KEY_RANGE("makeVersionMutationsDurable", update.version, KeyRangeRef());
	writeMutations( update.mutations, update.version, "makeVersionDurable" );

	auto written = update.mutations.begin();
	while (written) {
		bytesLeft -= mvccStorageBytes( *written );
		++written;
	}
	prevStorageVersion = update.version;
	return false;
}
// Stages a write of `version` (serialized with Unversioned()) under the persistVersion key of the
// underlying store. This function writes only that key; it does not commit.
void StorageServerDisk::makeVersionDurable( Version version ) {
	const auto encodedVersion = BinaryWriter::toValue( version, Unversioned() );
	storage->set( KeyValueRef( persistVersion, encodedVersion ) );
	//TraceEvent("MakeDurable", data->thisServerID).detail("FromVersion", prevStorageVersion).detail("ToVersion", version);
}
// Records `protocol` under the persistLogProtocol key as a SetValue mutation at `version`.
// The mutation is routed through StorageServer::addMutationToMutationLogOrStorage.
void StorageServerDisk::changeLogProtocol(Version version, ProtocolVersion protocol) {
	const auto encodedProtocol = BinaryWriter::toValue( protocol, Unversioned() );
	const MutationRef setProtocol( MutationRef::SetValue, persistLogProtocol, encodedProtocol );
	data->addMutationToMutationLogOrStorage( version, setProtocol );
}
// Loads the persisted byte-sample entries in [begin, end) from `storage` into
// data->metrics.byteSample.sample, reading the range in chunks.
//   begin, end  keys in the persisted (prefixed) key space; the persistByteSampleKeys.begin prefix is
//               stripped before entries are inserted in memory.
//   results     optional; when non-null every chunk read is appended to it, and the delay between
//               chunks is skipped.
// Entries whose key is already marked `true` in data->byteSampleClears are not inserted. After each
// chunk the span it covered is itself marked `true` in data->byteSampleClears.
ACTOR Future<Void> applyByteSampleResult( StorageServer* data, IKeyValueStore* storage, Key begin, Key end, std::vector<Standalone<VectorRef<KeyValueRef>>>* results = NULL) {
// Totals reported in the trace event emitted once the whole range has been read
state int totalFetches = 0;
state int totalKeys = 0;
state int totalBytes = 0;
loop {
// Read the next chunk; STORAGE_LIMIT_BYTES is passed for both limit arguments of readRange
Standalone<RangeResultRef> bs = wait( storage->readRange( KeyRangeRef(begin, end), SERVER_KNOBS->STORAGE_LIMIT_BYTES, SERVER_KNOBS->STORAGE_LIMIT_BYTES ) );
if(results) results->push_back(bs.castTo<VectorRef<KeyValueRef>>());
int rangeSize = bs.expectedSize();
totalFetches++;
totalKeys += bs.size();
totalBytes += rangeSize;
for( int j = 0; j < bs.size(); j++ ) {
KeyRef key = bs[j].key.removePrefix(persistByteSampleKeys.begin);
// Skip keys already marked in byteSampleClears
if(!data->byteSampleClears.rangeContaining(key).value()) {
data->metrics.byteSample.sample.insert( key, BinaryReader::fromStringRef<int32_t>(bs[j].value, Unversioned()), false );
}
}
if( rangeSize >= SERVER_KNOBS->STORAGE_LIMIT_BYTES ) {
// The chunk reached the size limit, so more data may remain: mark what was read and continue after its last key
Key nextBegin = keyAfter(bs.back().key);
data->byteSampleClears.insert(KeyRangeRef(begin, nextBegin).removePrefix(persistByteSampleKeys.begin), true);
data->byteSampleClearsTooLarge.set(data->byteSampleClears.size() > SERVER_KNOBS->MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE);
begin = nextBegin;
if(begin == end) {
break;
}
} else {
// The chunk was below the limit, so the range is exhausted: mark the remainder through `end`.
// When `end` is the end of the byte-sample key space, "\xff\xff\xff" is used as the unprefixed end key.
data->byteSampleClears.insert(KeyRangeRef(begin.removePrefix(persistByteSampleKeys.begin), end == persistByteSampleKeys.end ? LiteralStringRef("\xff\xff\xff") : end.removePrefix(persistByteSampleKeys.begin)), true);
data->byteSampleClearsTooLarge.set(data->byteSampleClears.size() > SERVER_KNOBS->MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE);
break;
}
// Pause between chunks only when the caller is not collecting the results
if(!results) {
wait(delay(SERVER_KNOBS->BYTE_SAMPLE_LOAD_DELAY));
}
}
TraceEvent("RecoveredByteSampleRange", data->thisServerID).detail("Begin", begin).detail("End", end).detail("Fetches", totalFetches).detail("Keys", totalKeys).detail("ReadBytes", totalBytes);
return Void();
}
// Restores the in-memory byte sample from `storage` in two phases:
//  1. Load everything under persistByteSampleSampleKeys (collected in byteSampleSample) and then
//     fulfil `byteSampleSampleRecovered`.
//  2. Once `startRestore` is ready and BYTE_SAMPLE_START_DELAY has elapsed, use the entries collected
//     in phase 1 as split points to divide the byte-sample key space into ranges, and load those
//     ranges with concurrent applyByteSampleResult actors.
// Completes after every range has been loaded.
ACTOR Future<Void> restoreByteSample(StorageServer* data, IKeyValueStore* storage, Promise<Void> byteSampleSampleRecovered, Future<Void> startRestore) {
state std::vector<Standalone<VectorRef<KeyValueRef>>> byteSampleSample;
wait( applyByteSampleResult(data, storage, persistByteSampleSampleKeys.begin, persistByteSampleSampleKeys.end, &byteSampleSample) );
byteSampleSampleRecovered.send(Void());
wait( startRestore );
wait( delay(SERVER_KNOBS->BYTE_SAMPLE_START_DELAY) );
size_t bytes_per_fetch = 0;
// Since the expected size also includes (as of now) the space overhead of the container, we calculate our own number here
for( auto& it : byteSampleSample ) {
for( auto& kv : it ) {
bytes_per_fetch += BinaryReader::fromStringRef<int32_t>(kv.value, Unversioned());
}
}
// Target size of each range: the total divided by the configured parallelism (the +1 keeps it non-zero)
bytes_per_fetch = (bytes_per_fetch/SERVER_KNOBS->BYTE_SAMPLE_LOAD_PARALLELISM) + 1;
state std::vector<Future<Void>> sampleRanges;
int accumulatedSize = 0;
Key lastStart = persistByteSampleKeys.begin; // make sure the first range starts at the absolute beginning of the byte sample
for( auto& it : byteSampleSample ) {
for( auto& kv : it ) {
// Once enough has accumulated, close the current range at this entry's key and start the next one there
if( accumulatedSize >= bytes_per_fetch ) {
accumulatedSize = 0;
Key realKey = kv.key.removePrefix( persistByteSampleKeys.begin );
sampleRanges.push_back( applyByteSampleResult(data, storage, lastStart, realKey) );
lastStart = realKey;
}
accumulatedSize += BinaryReader::fromStringRef<int32_t>(kv.value, Unversioned());
}
}
// make sure that the last range goes all the way to the end of the byte sample
sampleRanges.push_back( applyByteSampleResult(data, storage, lastStart, persistByteSampleKeys.end) );
wait( waitForAll( sampleRanges ) );
TraceEvent("RecoveredByteSampleChunkedRead", data->thisServerID).detail("Ranges",sampleRanges.size());
// Under BUGGIFY, add a random delay of up to 10 seconds before completing
if( BUGGIFY )
wait( delay( deterministicRandom()->random01() * 10.0 ) );
return Void();
}
// Rebuilds the in-memory state of `data` from what is persisted in `storage`.
// Returns false, after disposing `storage` and resetting data->thisServerID and data->sk, when no
// format key is present. Throws worker_recovery_failed() when the stored format is outside
// persistFormatReadableRange. Otherwise restores the server ID, log protocol, primary locality,
// version, shard availability and shard assignment, clears stored data for unavailable ranges,
// lets the byte-sample restore proceed, and returns true.
ACTOR Future<bool> restoreDurableState( StorageServer* data, IKeyValueStore* storage ) {
// Issue all reads before waiting on any of them
state Future<Optional<Value>> fFormat = storage->readValue(persistFormat.key);
state Future<Optional<Value>> fID = storage->readValue(persistID);
state Future<Optional<Value>> fVersion = storage->readValue(persistVersion);
state Future<Optional<Value>> fLogProtocol = storage->readValue(persistLogProtocol);
state Future<Optional<Value>> fPrimaryLocality = storage->readValue(persistPrimaryLocality);
state Future<Standalone<RangeResultRef>> fShardAssigned = storage->readRange(persistShardAssignedKeys);
state Future<Standalone<RangeResultRef>> fShardAvailable = storage->readRange(persistShardAvailableKeys);
// byteSampleSampleRecovered is fulfilled by restoreByteSample after its first phase;
// startByteSampleRestore is fulfilled at the end of this actor to release its second phase.
state Promise<Void> byteSampleSampleRecovered;
state Promise<Void> startByteSampleRestore;
data->byteSampleRecovery = restoreByteSample(data, storage, byteSampleSampleRecovered, startByteSampleRestore.getFuture());
TraceEvent("ReadingDurableState", data->thisServerID);
wait( waitForAll( std::vector{ fFormat, fID, fVersion, fLogProtocol, fPrimaryLocality } ) );
wait( waitForAll( std::vector{ fShardAssigned, fShardAvailable } ) );
wait( byteSampleSampleRecovered.getFuture() );
TraceEvent("RestoringDurableState", data->thisServerID);
if (!fFormat.get().present()) {
// The DB was never initialized
TraceEvent("DBNeverInitialized", data->thisServerID);
storage->dispose();
data->thisServerID = UID();
data->sk = Key();
return false;
}
if (!persistFormatReadableRange.contains( fFormat.get().get() )) {
TraceEvent(SevError, "UnsupportedDBFormat").detail("Format", fFormat.get().get().toString()).detail("Expected", persistFormat.value.toString());
throw worker_recovery_failed();
}
data->thisServerID = BinaryReader::fromStringRef<UID>(fID.get().get(), Unversioned());
data->sk = serverKeysPrefixFor( data->thisServerID ).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/
// The log protocol and primary locality are optional; the current values are kept when they are absent
if (fLogProtocol.get().present())
data->logProtocol = BinaryReader::fromStringRef<ProtocolVersion>(fLogProtocol.get().get(), Unversioned());
if (fPrimaryLocality.get().present())
data->primaryLocality = BinaryReader::fromStringRef<int8_t>(fPrimaryLocality.get().get(), Unversioned());
state Version version = BinaryReader::fromStringRef<Version>( fVersion.get().get(), Unversioned() );
debug_checkRestoredVersion( data->thisServerID, version, "StorageServer" );
data->setInitialVersion( version );
// Rebuild availability: each persisted entry covers the keys from its own (unprefixed) key up to the
// next entry's key, or up to allKeys.end for the last entry. A value of "0" means not available.
state Standalone<RangeResultRef> available = fShardAvailable.get();
state int availableLoc;
for(availableLoc=0; availableLoc<available.size(); availableLoc++) {
KeyRangeRef keys(
available[availableLoc].key.removePrefix(persistShardAvailableKeys.begin),
availableLoc+1==available.size() ? allKeys.end : available[availableLoc+1].key.removePrefix(persistShardAvailableKeys.begin));
ASSERT( !keys.empty() );
bool nowAvailable = available[availableLoc].value!=LiteralStringRef("0");
/*if(nowAvailable)
TraceEvent("AvailableShard", data->thisServerID).detail("RangeBegin", keys.begin).detail("RangeEnd", keys.end);*/
data->newestAvailableVersion.insert( keys, nowAvailable ? latestVersion : invalidVersion );
wait(yield());
}
// Rebuild assignment using the same boundary encoding, applying each range through changeServerKeys
state Standalone<RangeResultRef> assigned = fShardAssigned.get();
state int assignedLoc;
for(assignedLoc=0; assignedLoc<assigned.size(); assignedLoc++) {
KeyRangeRef keys(
assigned[assignedLoc].key.removePrefix(persistShardAssignedKeys.begin),
assignedLoc+1==assigned.size() ? allKeys.end : assigned[assignedLoc+1].key.removePrefix(persistShardAssignedKeys.begin));
ASSERT( !keys.empty() );
bool nowAssigned = assigned[assignedLoc].value!=LiteralStringRef("0");
/*if(nowAssigned)
TraceEvent("AssignedShard", data->thisServerID).detail("RangeBegin", keys.begin).detail("RangeEnd", keys.end);*/
changeServerKeys(data, keys, nowAssigned, version, CSK_RESTORE);
// An unassigned range is expected to be recorded as unavailable everywhere
if (!nowAssigned) ASSERT( data->newestAvailableVersion.allEqual(keys, invalidVersion) );
wait(yield());
}
wait( delay( 0.0001 ) );
{
// Erase data which isn't available (it is from some fetch at a later version)
// SOMEDAY: Keep track of keys that might be fetching, make sure we don't have any data elsewhere?
for(auto it = data->newestAvailableVersion.ranges().begin(); it != data->newestAvailableVersion.ranges().end(); ++it) {
if (it->value() == invalidVersion) {
KeyRangeRef clearRange(it->begin(), it->end());
// TODO(alexmiller): Figure out how to selectively enable spammy data distribution events.
//DEBUG_KEY_RANGE("clearInvalidVersion", invalidVersion, clearRange);
storage->clear( clearRange );
data->byteSampleApplyClear( clearRange, invalidVersion );
}
}
}
validate(data, true);
// Release the second phase of restoreByteSample
startByteSampleRestore.send(Void());
return true;
}
// Member entry point: starts the file-scope restoreDurableState actor on this object's storage
// server and key-value store, and returns that actor's future.
Future<bool> StorageServerDisk::restoreDurableState() {
	Future<bool> restored = ::restoreDurableState( data, storage );
	return restored;
}
//Determines whether a key-value pair should be included in a byte sample
//Also returns size information about the sample
//
// info.size         key size + value size.
// info.inSample     true when the first hash word of the key, scaled into [0, 1), is below the
//                   sampling probability. The decision depends only on the key bytes and the sizes.
// info.sampledSize  info.size divided by min(1.0, probability), or 0 when probability is not positive.
ByteSampleInfo isKeyValueInSample(KeyValueRef keyValue) {
	ByteSampleInfo info;

	const KeyRef key = keyValue.key;
	info.size = key.size() + keyValue.value.size();

	uint32_t a = 0;
	uint32_t b = 0;
	hashlittle2( key.begin(), key.size(), &a, &b );

	double probability = (double)info.size / (key.size() + SERVER_KNOBS->BYTE_SAMPLING_OVERHEAD) / SERVER_KNOBS->BYTE_SAMPLING_FACTOR;
	// (1 << 30) * 4.0 == 2^32, so `a` is mapped into [0, 1)
	info.inSample = a / ((1 << 30) * 4.0) < probability;
	// An empty key with an empty value has size 0 and therefore probability 0. Dividing 0 by 0.0 yields NaN,
	// and converting NaN to an integer is undefined behaviour, so report a sampled size of 0 instead.
	info.sampledSize = probability > 0 ? info.size / std::min(1.0, probability) : 0;

	return info;
}
// Routes mutation `m` according to `ver`:
//  - ver == invalidVersion: the mutation is written straight to storage and then applied to the
//    byte sample with that same version;
//  - otherwise: the mutation is appended to the mutation-log entry for `ver`.
void StorageServer::addMutationToMutationLogOrStorage( Version ver, MutationRef m ) {
	if (ver == invalidVersion) {
		storage.writeMutation( m );
		byteSampleApplyMutation( m, ver );
		return;
	}

	auto& logEntry = addVersionToMutationLog( ver );
	addMutationToMutationLog( logEntry, m );
}
// Applies a set of `kv` to the byte sample: updates the in-memory sample, records the matching
// persistent change through addMutationToMutationLogOrStorage(ver, ...), and reports any change in
// sampled bytes for the key to metrics.notifyBytes.
void StorageServer::byteSampleApplySet( KeyValueRef kv, Version ver ) {
	const KeyRef key = kv.key;
	ByteSampleInfo info = isKeyValueInSample( kv );
	auto& sample = metrics.byteSample.sample;

	// Whatever the key contributed before is removed from the running delta
	auto existing = sample.find( key );
	const bool wasSampled = existing != sample.end();
	int64_t delta = 0;
	if (wasSampled) delta = -sample.getMetric( existing );

	if (!info.inSample) {
		bool needsClear = wasSampled;
		// While byteSampleRecovery is not ready, mark the key in byteSampleClears if it is not marked yet,
		// and clear it on disk even when it is not in the in-memory sample
		if (!byteSampleRecovery.isReady() && !byteSampleClears.rangeContaining(key).value()) {
			byteSampleClears.insert(key, true);
			byteSampleClearsTooLarge.set(byteSampleClears.size() > SERVER_KNOBS->MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE);
			needsClear = true;
		}
		if (needsClear) {
			sample.erase( existing );
			auto diskRange = singleKeyRange( key.withPrefix(persistByteSampleKeys.begin) );
			addMutationToMutationLogOrStorage( ver, MutationRef(MutationRef::ClearRange, diskRange.begin, diskRange.end) );
		}
	} else {
		delta += info.sampledSize;
		sample.insert( key, info.sampledSize );
		addMutationToMutationLogOrStorage( ver, MutationRef(MutationRef::SetValue, key.withPrefix(persistByteSampleKeys.begin), BinaryWriter::toValue( info.sampledSize, Unversioned() )) );
	}

	if (delta) metrics.notifyBytes( key, delta );
}
// Applies a clear of `range` to the byte sample: notifies waiting metrics of the bytes removed,
// and, when anything needs removing, erases the range from the in-memory sample and records the
// matching ClearRange through addMutationToMutationLogOrStorage(ver, ...).
void StorageServer::byteSampleApplyClear( KeyRangeRef range, Version ver ) {
	auto& sample = metrics.byteSample.sample;
	bool mustClear = false;

	if (range.begin < allKeys.end) {
		//NotifyBytes should not be called for keys past allKeys.end
		KeyRangeRef notifyRange = KeyRangeRef( range.begin, std::min(range.end, allKeys.end) );
		counters.sampledBytesCleared += sample.sumRange( notifyRange.begin, notifyRange.end );

		auto waiters = metrics.waitMetricsMap.intersectingRanges( notifyRange );
		for (auto shard = waiters.begin(); shard != waiters.end(); ++shard) {
			KeyRangeRef overlap = shard.range() & range;
			int64_t bytes = sample.sumRange( overlap.begin, overlap.end );
			metrics.notifyBytes( shard, -bytes );
			if (bytes > 0) mustClear = true;
		}
	}

	// Sampled bytes beyond allKeys.end also require the clear to be applied
	if (range.end > allKeys.end && sample.sumRange( std::max(allKeys.end, range.begin), range.end ) > 0) {
		mustClear = true;
	}

	// While byteSampleRecovery is not ready, mark the whole range in byteSampleClears if any part of it
	// is not yet marked, and apply the clear regardless of what the in-memory sample holds
	if (!byteSampleRecovery.isReady()) {
		auto marked = byteSampleClears.intersectingRanges( range );
		for (auto piece : marked) {
			if (piece.value()) continue;
			byteSampleClears.insert( range, true );
			byteSampleClearsTooLarge.set(byteSampleClears.size() > SERVER_KNOBS->MAX_BYTE_SAMPLE_CLEAR_MAP_SIZE);
			mustClear = true;
			break;
		}
	}

	if (!mustClear) return;

	sample.eraseAsync( range.begin, range.end );
	auto diskRange = range.withPrefix( persistByteSampleKeys.begin );
	addMutationToMutationLogOrStorage( ver, MutationRef(MutationRef::ClearRange, diskRange.begin, diskRange.end) );
}
// Serves one WaitMetricsRequest: replies as soon as the metrics for req.keys fall outside
// [req.min, req.max], or when `timeout` fires.
// If the current metrics are already outside the bounds the reply is sent immediately. Otherwise a
// change stream is registered on every range of self->waitMetricsMap covering req.keys, and deltas
// received on it are accumulated until one of the conditions above is met. The registration is
// removed before returning. An error caught while waiting is rethrown unless it is
// wrong_shard_server, which is forwarded to the requester instead.
ACTOR Future<Void> waitMetrics( StorageServerMetrics* self, WaitMetricsRequest req, Future<Void> timeout ) {
state PromiseStream< StorageMetrics > change;
state StorageMetrics metrics = self->getMetrics( req.keys );
// Holds an error caught while waiting so that cleanup can run before it is acted upon
state Error error = success();
state bool timedout = false;
if ( !req.min.allLessOrEqual( metrics ) || !metrics.allLessOrEqual( req.max ) ) {
TEST( true ); // ShardWaitMetrics return case 1 (quickly)
req.reply.send( metrics );
return Void();
}
{
// Register the change stream on every range covering req.keys
auto rs = self->waitMetricsMap.modify( req.keys );
for(auto r = rs.begin(); r != rs.end(); ++r)
r->value().push_back( change );
loop {
try {
choose {
when( StorageMetrics c = waitNext( change.getFuture() ) ) {
metrics += c;
// SOMEDAY: validation! The changes here are possibly partial changes (we receive multiple messages per
// update to our requested range). This means that the validation would have to occur after all
// the messages for one clear or set have been dispatched.
/*StorageMetrics m = getMetrics( data, req.keys );
bool b = ( m.bytes != metrics.bytes || m.bytesPerKSecond != metrics.bytesPerKSecond || m.iosPerKSecond != metrics.iosPerKSecond );
if (b) {
printf("keys: '%s' - '%s' @%p\n", printable(req.keys.begin).c_str(), printable(req.keys.end).c_str(), this);
printf("waitMetrics: desync %d (%lld %lld %lld) != (%lld %lld %lld); +(%lld %lld %lld)\n", b, m.bytes, m.bytesPerKSecond, m.iosPerKSecond, metrics.bytes, metrics.bytesPerKSecond, metrics.iosPerKSecond, c.bytes, c.bytesPerKSecond, c.iosPerKSecond);
}*/
}
when( wait( timeout ) ) {
timedout = true;
}
}
} catch (Error& e) {
if( e.code() == error_code_actor_cancelled ) throw; // This is only cancelled when the main loop had exited...no need in this case to clean up self
error = e;
break;
}
if( timedout ) {
TEST( true ); // ShardWaitMetrics return on timeout
//FIXME: instead of using random chance, send wrong_shard_server when the call in from waitMetricsMultiple (requires additional information in the request)
if(deterministicRandom()->random01() < SERVER_KNOBS->WAIT_METRICS_WRONG_SHARD_CHANCE) {
req.reply.sendError( wrong_shard_server() );
} else {
req.reply.send( metrics );
}
break;
}
if ( !req.min.allLessOrEqual( metrics ) || !metrics.allLessOrEqual( req.max ) ) {
TEST( true ); // ShardWaitMetrics return case 2 (delayed)
req.reply.send( metrics );
break;
}
}
wait( delay(0) ); //prevent iterator invalidation of functions sending changes
}
// Unregister: remove this actor's change stream from every range it was added to
auto rs = self->waitMetricsMap.modify( req.keys );
for(auto i = rs.begin(); i != rs.end(); ++i) {
auto &x = i->value();
for( int j = 0; j < x.size(); j++ ) {
if( x[j] == change ) {
swapAndPop(&x, j);
break;
}
}
}
self->waitMetricsMap.coalesce( req.keys );
// Act on an error saved from the wait loop now that cleanup is done
if (error.code() != error_code_success ) {
if (error.code() != error_code_wrong_shard_server) throw error;
TEST( true ); // ShardWaitMetrics delayed wrong_shard_server()
req.reply.sendError(error);
}
return Void();
}
// Member entry point: starts the file-scope waitMetrics actor for `req` on this metrics object.
// `delay` is handed to that actor as its timeout future.
Future<Void> StorageServerMetrics::waitMetrics(WaitMetricsRequest req, Future<Void> delay) {
	Future<Void> served = ::waitMetrics( this, req, delay );
	return served;
}
#ifndef __INTEL_COMPILER
#pragma endregion
#endif
/////////////////////////////// Core //////////////////////////////////////
#ifndef __INTEL_COMPILER
#pragma region Core
#endif
// Serves the metrics-related request streams of `ssi` (waitMetrics, splitMetrics, getStorageMetrics,
// getReadHotRanges) and periodically polls self->metrics. Nothing is served until
// self->byteSampleRecovery has completed. This actor loops forever.
ACTOR Future<Void> metricsCore( StorageServer* self, StorageServerInterface ssi ) {
// Ready immediately, so the first poll happens on the first pass through the loop
state Future<Void> doPollMetrics = Void();
wait( self->byteSampleRecovery );
self->actors.add(traceCounters("StorageMetrics", self->thisServerID, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self->counters.cc, self->thisServerID.toString() + "/StorageMetrics"));
loop {
choose {
// Requests that name a key range are rejected with wrong_shard_server when the range is not readable here
when (WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
if (!self->isReadable( req.keys )) {
TEST( true ); // waitMetrics immediate wrong_shard_server()
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->actors.add( self->metrics.waitMetrics( req, delayJittered( SERVER_KNOBS->STORAGE_METRIC_TIMEOUT ) ) );
}
}
when (SplitMetricsRequest req = waitNext(ssi.splitMetrics.getFuture())) {
if (!self->isReadable( req.keys )) {
TEST( true ); // splitMetrics immediate wrong_shard_server()
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->metrics.splitMetrics( req );
}
}
when (GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
StorageBytes sb = self->storage.getStorageBytes();
self->metrics.getStorageMetrics( req, sb, self->counters.bytesInput.getRate(), self->versionLag, self->lastUpdate );
}
when(ReadHotSubRangeRequest req = waitNext(ssi.getReadHotRanges.getFuture())) {
if (!self->isReadable(req.keys)) {
TEST(true); // readHotSubRanges immediate wrong_shard_server()
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->metrics.getReadHotRanges(req);
}
}
// Periodic poll; re-armed after every run
when (wait(doPollMetrics) ) {
self->metrics.poll();
doPollMetrics = delay(SERVER_KNOBS->STORAGE_SERVER_POLL_METRICS_DELAY);
}
}
}
}
// Emits a "LongByteSampleRecovery" trace event if `recovery` has not completed within
// LONG_BYTE_SAMPLE_RECOVERY_DELAY. The severity is SevWarn in simulation and SevWarnAlways otherwise.
// Returns as soon as either the recovery or the delay finishes; it does not wait for the recovery
// after logging.
ACTOR Future<Void> logLongByteSampleRecovery(Future<Void> recovery) {
choose {
when(wait(recovery)) {}
when(wait(delay(SERVER_KNOBS->LONG_BYTE_SAMPLE_RECOVERY_DELAY))) {
TraceEvent(g_network->isSimulated() ? SevWarn : SevWarnAlways, "LongByteSampleRecovery");
}
}
return Void();
}
// Periodically compares this server's version with a raw read version obtained from the cluster
// and maintains self->versionBehind. The flag is set once the server has been more than
// BEHIND_CHECK_VERSIONS behind for at least BEHIND_CHECK_COUNT consecutive checks, and is cleared
// by the first check that finds it within that distance. This actor loops forever.
ACTOR Future<Void> checkBehind( StorageServer* self ) {
// Number of consecutive checks that found this server behind
state int behindCount = 0;
loop {
wait( delay(SERVER_KNOBS->BEHIND_CHECK_DELAY) );
state Transaction tr(self->cx);
// Retry loop: on error, wait on tr.onError(e) and try again
loop {
try {
Version readVersion = wait( tr.getRawReadVersion() );
if( readVersion > self->version.get() + SERVER_KNOBS->BEHIND_CHECK_VERSIONS ) {
behindCount++;
} else {
behindCount = 0;
}
self->versionBehind = behindCount >= SERVER_KNOBS->BEHIND_CHECK_COUNT;
break;
} catch( Error &e ) {
wait(tr.onError(e));
}
}
}
}
// Receives GetValueRequests from `getValue` forever and starts a getValueQ handler for each one,
// wrapped in self->readGuard and added to self->actors.
// When SHORT_CIRCUT_ACTUAL_STORAGE is set and the key lies in normalKeys, a default-constructed
// GetValueReply is sent back immediately instead.
ACTOR Future<Void> serveGetValueRequests( StorageServer* self, FutureStream<GetValueRequest> getValue ) {
loop {
GetValueRequest req = waitNext(getValue);
// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
if( req.debugID.present() )
g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "storageServer.received"); //.detail("TaskID", g_network->getCurrentTask());
if (SHORT_CIRCUT_ACTUAL_STORAGE && normalKeys.contains(req.key))
req.reply.send(GetValueReply());
else
self->actors.add(self->readGuard(req , getValueQ));
}
}
// Receives GetKeyValuesRequests from `getKeyValues` forever and starts a getKeyValuesQ handler for
// each one, wrapped in self->readGuard and added to self->actors.
ACTOR Future<Void> serveGetKeyValuesRequests( StorageServer* self, FutureStream<GetKeyValuesRequest> getKeyValues ) {
loop {
GetKeyValuesRequest req = waitNext(getKeyValues);
// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
self->actors.add(self->readGuard(req, getKeyValuesQ));
}
}
// Receives GetKeyRequests from `getKey` forever and starts a getKeyQ handler for each one,
// wrapped in self->readGuard and added to self->actors.
ACTOR Future<Void> serveGetKeyRequests( StorageServer* self, FutureStream<GetKeyRequest> getKey ) {
loop {
GetKeyRequest req = waitNext(getKey);
// Warning: This code is executed at extremely high priority (TaskPriority::LoadBalancedEndpoint), so downgrade before doing real work
self->actors.add(self->readGuard(req , getKeyQ));
}
}
// Receives WatchValueRequests from `watchValue` forever and starts a watchValueQ handler for each
// one, wrapped in self->readGuard and added to self->actors.
ACTOR Future<Void> serveWatchValueRequests( StorageServer* self, FutureStream<WatchValueRequest> watchValue ) {
loop {
WatchValueRequest req = waitNext(watchValue);
// TODO: fast load balancing?
// SOMEDAY: combine watches for the same key/value into a single watch
self->actors.add(self->readGuard(req, watchValueQ));
}
}
// Main loop of a running storage server. Starts the long-lived helper actors (storage updates,
// failure monitoring, metrics, the request-serving actors, ...), signals self->coreStarted, and
// then loops forever reacting to: the no-recent-updates check, ServerDBInfo changes, shard-state /
// queuing-metrics / store-type requests, completion of the current update() pass, the process-stats
// timer, and results from self->actors.
ACTOR Future<Void> storageServerCore( StorageServer* self, StorageServerInterface ssi )
{
// The Void() initial values below make the corresponding `when` clauses fire on the first loop pass
state Future<Void> doUpdate = Void();
state bool updateReceived = false; // true iff the current update() actor assigned to doUpdate has already received an update from the tlog
state double lastLoopTopTime = now();
state Future<Void> dbInfoChange = Void();
state Future<Void> checkLastUpdate = Void();
state Future<Void> updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
self->actors.add(updateStorage(self));
self->actors.add(waitFailureServer(ssi.waitFailure.getFuture()));
self->actors.add(self->otherError.getFuture());
self->actors.add(metricsCore(self, ssi));
self->actors.add(logLongByteSampleRecovery(self->byteSampleRecovery));
self->actors.add(checkBehind(self));
self->actors.add(serveGetValueRequests(self, ssi.getValue.getFuture()));
self->actors.add(serveGetKeyValuesRequests(self, ssi.getKeyValues.getFuture()));
self->actors.add(serveGetKeyRequests(self, ssi.getKey.getFuture()));
self->actors.add(serveWatchValueRequests(self, ssi.watchValue.getFuture()));
self->actors.add(traceRole(Role::STORAGE_SERVER, ssi.id()));
// Start a tag-measurement interval now, and a new one every READ_TAG_MEASUREMENT_INTERVAL
self->transactionTagCounter.startNewInterval(self->thisServerID);
self->actors.add(recurring([&](){ self->transactionTagCounter.startNewInterval(self->thisServerID); }, SERVER_KNOBS->READ_TAG_MEASUREMENT_INTERVAL));
self->coreStarted.send( Void() );
loop {
++self->counters.loops;
// Log a sampled warning (1% of occurrences) when more than 50ms elapsed since the previous loop pass
double loopTopTime = now();
double elapsedTime = loopTopTime - lastLoopTopTime;
if( elapsedTime > 0.050 ) {
if (deterministicRandom()->random01() < 0.01)
TraceEvent(SevWarn, "SlowSSLoopx100", self->thisServerID).detail("Elapsed", elapsedTime);
}
lastLoopTopTime = loopTopTime;
choose {
// Set noRecentUpdates once lastUpdate is at least NO_RECENT_UPDATES_DURATION old; otherwise re-check
// when that could become true (but no sooner than 0.1s from now)
when( wait( checkLastUpdate ) ) {
if(now() - self->lastUpdate >= CLIENT_KNOBS->NO_RECENT_UPDATES_DURATION) {
self->noRecentUpdates.set(true);
checkLastUpdate = delay(CLIENT_KNOBS->NO_RECENT_UPDATES_DURATION);
} else {
checkLastUpdate = delay( std::max(CLIENT_KNOBS->NO_RECENT_UPDATES_DURATION-(now()-self->lastUpdate), 0.1) );
}
}
// ServerDBInfo changed: rebuild the log system and peek cursor when commits are being accepted,
// then refresh the read latency bands if their configuration changed
when( wait( dbInfoChange ) ) {
TEST( self->logSystem ); // shardServer dbInfo changed
dbInfoChange = self->db->onChange();
if( self->db->get().recoveryState >= RecoveryState::ACCEPTING_COMMITS ) {
self->logSystem = ILogSystem::fromServerDBInfo( self->thisServerID, self->db->get() );
if (self->logSystem) {
if(self->db->get().logSystemConfig.recoveredAt.present()) {
self->poppedAllAfter = self->db->get().logSystemConfig.recoveredAt.get();
}
self->logCursor = self->logSystem->peekSingle( self->thisServerID, self->version.get() + 1, self->tag, self->history );
self->popVersion( self->durableVersion.get() + 1, true );
}
// If update() is waiting for results from the tlog, it might never get them, so needs to be cancelled. But if it is waiting later,
// cancelling it could cause problems (e.g. fetchKeys that already committed to transitioning to waiting state)
if (!updateReceived) {
doUpdate = Void();
}
}
Optional<LatencyBandConfig> newLatencyBandConfig = self->db->get().latencyBandConfig;
if(newLatencyBandConfig.present() != self->latencyBandConfig.present()
|| (newLatencyBandConfig.present() && newLatencyBandConfig.get().readConfig != self->latencyBandConfig.get().readConfig))
{
self->latencyBandConfig = newLatencyBandConfig;
self->counters.readLatencyBands.clearBands();
TraceEvent("LatencyBandReadUpdatingConfig").detail("Present", newLatencyBandConfig.present());
if(self->latencyBandConfig.present()) {
for(auto band : self->latencyBandConfig.get().readConfig.bands) {
self->counters.readLatencyBands.addThreshold(band);
}
}
}
}
// NO_WAIT shard-state requests are answered inline; other modes are handed to getShardStateQ
when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) {
if (req.mode == GetShardStateRequest::NO_WAIT ) {
if( self->isReadable( req.keys ) )
req.reply.send(GetShardStateReply{ self->version.get(), self->durableVersion.get() });
else
req.reply.sendError(wrong_shard_server());
} else {
self->actors.add( getShardStateQ( self, req ) );
}
}
when (StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) {
getQueuingMetrics(self, req);
}
when( ReplyPromise<KeyValueStoreType> reply = waitNext(ssi.getKeyValueStoreType.getFuture()) ) {
reply.send( self->storage.getKeyValueStoreType() );
}
// The previous update() pass finished (or was reset above): start the next one, or park on Never()
// until a log system is available
when( wait(doUpdate) ) {
updateReceived = false;
if (!self->logSystem)
doUpdate = Never();
else
doUpdate = update( self, &updateReceived );
}
when(wait(updateProcessStatsTimer)) {
updateProcessStats(self);
updateProcessStatsTimer = delay(SERVER_KNOBS->FASTRESTORE_UPDATE_PROCESS_STATS_INTERVAL);
}
// Waiting on the actor collection here lets an error from any added actor propagate out of this loop
when(wait(self->actors.getResult())) {}
}
}
}
// Shuts the storage server down in response to error `e` and reports whether `e` is one of the
// expected termination errors.
//  - Marks the server as shutting down and replaces every shard with an empty reference.
//  - please_reboot: the store is left untouched.
//    worker_removed / recruitment_failed: the store is disposed.
//    anything else: the store is closed.
// Returns true (after logging "StorageServerTerminated") for worker_removed, recruitment_failed,
// file_not_found and actor_cancelled; returns false for every other error.
bool storageServerTerminated(StorageServer& self, IKeyValueStore* persistentData, Error const& e) {
	self.shuttingDown = true;

	// Clearing shards shuts down any fetchKeys actors; these may do things on cancellation that are best done with self still valid
	self.shards.insert( allKeys, Reference<ShardInfo>() );

	const int code = e.code();
	const bool permanentlyRemoved = code == error_code_worker_removed || code == error_code_recruitment_failed;

	// Dispose the IKVS (destroying its data permanently) only if this shutdown is definitely permanent.  Otherwise just close it.
	if (code != error_code_please_reboot) {
		if (permanentlyRemoved) {
			persistentData->dispose();
		} else {
			persistentData->close();
		}
	}

	const bool expectedTermination = permanentlyRemoved ||
	                                 code == error_code_file_not_found ||
	                                 code == error_code_actor_cancelled;
	if (expectedTermination) {
		TraceEvent("StorageServerTerminated", self.thisServerID).error(e, true);
	}
	return expectedTermination;
}
// For a MEMORY-type store with a usable cluster connection file: polls the database until
// canRemoveStorageServer() reports that server `id` can be removed, then returns.
// For any other store type, or when `connFile` is null, the result is Never().
ACTOR Future<Void> memoryStoreRecover(IKeyValueStore* store, Reference<ClusterConnectionFile> connFile, UID id)
{
if (store->getType() != KeyValueStoreType::MEMORY || connFile.getPtr() == nullptr) {
return Never();
}
// create a temp client connect to DB
Database cx = Database::createDatabase(connFile, Database::API_VERSION_LATEST);
state Transaction tr( cx );
// Number of attempts so far that found the server not removable; reported in the retry trace event
state int noCanRemoveCount = 0;
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state bool canRemove = wait( canRemoveStorageServer( &tr, id ) );
if (!canRemove) {
TEST(true); // it's possible that the caller had a transaction in flight that assigned keys to the server. Wait for it to reverse its mistake.
wait( delayJittered(SERVER_KNOBS->REMOVE_RETRY_DELAY, TaskPriority::UpdateStorage) );
tr.reset();
TraceEvent("RemoveStorageServerRetrying").detail("Count", noCanRemoveCount++).detail("ServerID", id).detail("CanRemove", canRemove);
} else {
return Void();
}
} catch (Error& e) {
// Copy the error into actor state so it is still available after the wait below
state Error err = e;
wait(tr.onError(e));
TraceEvent("RemoveStorageServerRetrying").error(err);
}
}
}
// Entry point for a storage server that is being recruited (this overload answers `recruitReply`).
//
// persistentData: key-value store backing this server; handed to storageServerTerminated() on failure.
// ssi:            interface of this storage server; echoed back in the recruitment reply.
// seedTag:        tag to use directly, or invalidTag to obtain one via addStorageServer().
// recruitReply:   receives an InitializeStorageReply on success, or recruitment_failed if an error
//                 occurs before the reply was sent.
// db, folder:     passed to / stored on the StorageServer state object.
//
// Returns Void() when storageServerTerminated() reports the error as an expected termination;
// otherwise rethrows the error. storageServerCore() is not expected to return normally: if it does,
// internal_error is thrown.
ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerInterface ssi, Tag seedTag, ReplyPromise<InitializeStorageReply> recruitReply,
Reference<AsyncVar<ServerDBInfo>> db, std::string folder )
{
state StorageServer self(persistentData, db, ssi);
self.sk = serverKeysPrefixFor( self.thisServerID ).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/
self.folder = folder;
try {
// Initialise the store and commit once before anything else is written.
wait( self.storage.init() );
wait( self.storage.commit() );
if (seedTag == invalidTag) {
// No seed tag supplied: register with the cluster to obtain a (version, tag) pair.
std::pair<Version, Tag> verAndTag = wait( addStorageServer(self.cx, ssi) ); // Might throw recruitment_failed in case of simultaneous master failure
self.tag = verAndTag.second;
// The initial version is one less than the version returned by addStorageServer().
self.setInitialVersion( verAndTag.first-1 );
} else {
self.tag = seedTag;
}
// Persist the new-server state and commit it before replying to the recruiter.
self.storage.makeNewStorageServerDurable();
wait( self.storage.commit() );
TraceEvent("StorageServerInit", ssi.id()).detail("Version", self.version.get()).detail("SeedTag", seedTag.toString());
InitializeStorageReply rep;
rep.interf = ssi;
rep.addedVersion = self.version.get();
recruitReply.send(rep);
// Assign an already-completed value to byteSampleRecovery on this path.
self.byteSampleRecovery = Void();
wait( storageServerCore(&self, ssi) );
// Reaching this point means storageServerCore() returned without throwing, which is treated as a bug.
throw internal_error();
} catch (Error& e) {
// If we die with an error before replying to the recruitment request, send the error to the recruiter (ClusterController, and from there to the DataDistributionTeamCollection)
if (!recruitReply.isSet())
recruitReply.sendError( recruitment_failed() );
// storageServerTerminated() cleans up and says whether this error is an expected termination.
if (storageServerTerminated(self, persistentData, e))
return Void();
throw e;
}
}
// Writes this storage server's interface `ssi` into the server list and establishes the tag and
// tag history that `self` will use. Called from the restart overload of storageServer() in this file.
//
// Each attempt asks a proxy for GetStorageServerRejoinInfo, builds one transaction at the returned
// version, and commits it. An attempt is abandoned and restarted whenever self->db changes, and is
// retried through tr.onError() when the transaction fails.
//
// On a successful commit, self->history, self->allHistory and self->tag are updated and the actor
// returns Void(). Under BUGGIFY with a non-empty history it throws please_reboot instead.
ACTOR Future<Void> replaceInterface( StorageServer* self, StorageServerInterface ssi )
{
state Transaction tr(self->cx);
loop {
// Capture the change future before reading the proxy list, so a later change to self->db is observed below.
state Future<Void> infoChanged = self->db->onChange();
state Reference<ProxyInfo> proxies( new ProxyInfo(self->db->get().client.proxies) );
choose {
// With no proxies known, wait on Never() so that only infoChanged can fire.
when( GetStorageServerRejoinInfoReply _rep = wait( proxies->size() ? basicLoadBalance( proxies, &MasterProxyInterface::getStorageServerRejoinInfo, GetStorageServerRejoinInfoRequest(ssi.id(), ssi.locality.dcId()) ) : Never() ) ) {
// Copy the reply into a state variable so it remains available across the waits below.
state GetStorageServerRejoinInfoReply rep = _rep;
try {
tr.reset();
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
// Run the transaction at the version supplied by the proxy's reply.
tr.setVersion( rep.version );
// Read-conflict on every key the reply was derived from: server list entry, tag, tag history, locality entry.
tr.addReadConflictRange(singleKeyRange(serverListKeyFor(ssi.id())));
tr.addReadConflictRange(singleKeyRange(serverTagKeyFor(ssi.id())));
tr.addReadConflictRange(serverTagHistoryRangeFor(ssi.id()));
tr.addReadConflictRange(singleKeyRange(tagLocalityListKeyFor(ssi.locality.dcId())));
// Publish the current interface under this server's server-list key.
tr.set(serverListKeyFor(ssi.id()), serverListValue(ssi));
if(rep.newLocality) {
// A new locality entry is being created: conflict on the whole locality list and record the new tag's locality.
// NOTE(review): rep.newTag.get() is used here without a present() check — presumably newLocality implies newTag; confirm.
tr.addReadConflictRange(tagLocalityListKeys);
tr.set( tagLocalityListKeyFor(ssi.locality.dcId()), tagLocalityListValue(rep.newTag.get().locality) );
}
if(rep.newTag.present()) {
// Switching to a new tag: conflict (read and write) on the new tag's conflict key,
// store the new tag, and append the old tag to the history under a versionstamped key.
KeyRange conflictRange = singleKeyRange(serverTagConflictKeyFor(rep.newTag.get()));
tr.addReadConflictRange( conflictRange );
tr.addWriteConflictRange( conflictRange );
tr.setOption(FDBTransactionOptions::FIRST_IN_BATCH);
tr.set( serverTagKeyFor(ssi.id()), serverTagValue(rep.newTag.get()) );
tr.atomicOp( serverTagHistoryKeyFor(ssi.id()), serverTagValue(rep.tag), MutationRef::SetVersionstampedKey );
}
// If the last history entry's version is below this server's current version,
// clear the history range before the current version.
if(rep.history.size() && rep.history.back().first < self->version.get()) {
tr.clear(serverTagHistoryRangeBefore(ssi.id(), self->version.get()));
}
choose {
when ( wait( tr.commit() ) ) {
self->history = rep.history;
if(rep.newTag.present()) {
// Adopt the new tag and put the old tag at the front of the history, keyed by the commit version.
self->tag = rep.newTag.get();
self->history.insert(self->history.begin(), std::make_pair(tr.getCommittedVersion(), rep.tag));
} else {
self->tag = rep.tag;
}
self->allHistory = self->history;
TraceEvent("SSTag", self->thisServerID).detail("MyTag", self->tag.toString());
for(auto it : self->history) {
TraceEvent("SSHistory", self->thisServerID).detail("Ver", it.first).detail("Tag", it.second.toString());
}
// Under BUGGIFY, request a reboot whenever there is tag history.
if(self->history.size() && BUGGIFY) {
TraceEvent("SSHistoryReboot", self->thisServerID);
throw please_reboot();
}
// Success: leave the outer loop.
break;
}
// self->db changed while committing: drop this attempt and start over.
when ( wait(infoChanged) ) {}
}
} catch (Error& e) {
// Let the transaction decide whether the error is retryable; if onError() throws, the actor fails with it.
wait( tr.onError(e) );
}
}
// self->db changed before a reply arrived: loop again with the fresh proxy list.
when ( wait(infoChanged) ) {}
}
}
return Void();
}
// Entry point for a storage server that is restarting from existing persistent data (this overload
// takes `recovered` and `connFile` instead of a recruitment reply).
//
// persistentData: key-value store backing this server; handed to storageServerTerminated() on failure.
// ssi:            interface of this storage server; asserted to match the restored server ID.
// db, folder:     passed to / stored on the StorageServer state object.
// recovered:      set (if it can still be set) once durable state has been restored, when the restore
//                 reports failure, or when an error is caught.
// connFile:       cluster connection file forwarded to memoryStoreRecover().
//
// Returns Void() when restoreDurableState() yields false, or when storageServerTerminated() reports
// the error as an expected termination; otherwise rethrows the error. storageServerCore() is not
// expected to return normally: if it does, internal_error is thrown.
ACTOR Future<Void> storageServer( IKeyValueStore* persistentData, StorageServerInterface ssi, Reference<AsyncVar<ServerDBInfo>> db, std::string folder, Promise<Void> recovered, Reference<ClusterConnectionFile> connFile)
{
state StorageServer self(persistentData, db, ssi);
self.folder = folder;
self.sk = serverKeysPrefixFor( self.thisServerID ).withPrefix(systemKeys.begin); // FFFF/serverKeys/[this server]/
try {
// Start time, used for the "TimeTaken" details in the trace events below.
state double start = now();
TraceEvent("StorageServerRebootStart", self.thisServerID);
wait(self.storage.init());
choose {
//after a rollback there might be uncommitted changes.
//for memory storage engine type, wait until recovery is done before commit
when( wait(self.storage.commit())) {}
// If memoryStoreRecover() finishes before the commit does, this server is treated as removed.
when( wait(memoryStoreRecover (persistentData, connFile, self.thisServerID))) {
TraceEvent("DisposeStorageServer", self.thisServerID);
throw worker_removed();
}
}
bool ok = wait( self.storage.restoreDurableState() );
if (!ok) {
// Restore reported failure: signal `recovered` and finish without starting the core loop.
if(recovered.canBeSet()) recovered.send(Void());
return Void();
}
TraceEvent("SSTimeRestoreDurableState", self.thisServerID).detail("TimeTaken", now() - start);
ASSERT( self.thisServerID == ssi.id() );
TraceEvent("StorageServerReboot", self.thisServerID)
.detail("Version", self.version.get());
if(recovered.canBeSet()) recovered.send(Void());
// Publish the interface and settle tag/history before running the core loop.
wait( replaceInterface( &self, ssi ) );
TraceEvent("StorageServerStartingCore", self.thisServerID).detail("TimeTaken", now() - start);
//wait( delay(0) ); // To make sure self->zkMasterInfo.onChanged is available to wait on
wait( storageServerCore(&self, ssi) );
// Reaching this point means storageServerCore() returned without throwing, which is treated as a bug.
throw internal_error();
} catch (Error& e) {
// Make sure `recovered` is signalled even when failing before the normal send above.
if(recovered.canBeSet()) recovered.send(Void());
// storageServerTerminated() cleans up and says whether this error is an expected termination.
if (storageServerTerminated(self, persistentData, e))
return Void();
throw e;
}
}
#ifndef __INTEL_COMPILER
#pragma endregion
#endif
/*
4 Reference count
4 priority
24 pointers
8 lastUpdateVersion
2 updated, replacedPointer
--
42 PTree overhead
8 Version insertVersion
--
50 VersionedMap overhead
12 KeyRef
12 ValueRef
1 isClear
--
25 payload
50 overhead
25 payload
21 structure padding
32 allocator rounds up
---
128 allocated
To reach 64, need to save: 11 bytes + all padding
Possibilities:
-8 Combine lastUpdateVersion, insertVersion?
-2 Fold together updated, replacedPointer, isClear bits
-3 Fold away updated, replacedPointer, isClear
-8 Move value lengths into arena
-4 Replace priority with H(pointer)
-12 Compress pointers (using special allocator)
-4 Modular lastUpdateVersion (make sure no node survives 4 billion updates)
*/
// Ad-hoc memory measurement for VersionedMap<int,int>.
//
// Creates 1000 versions and, in each, performs 1000 "erase(k-5, k+5) then insert(k, version)"
// operations with k drawn from deterministicRandom()->randomInt(0, 2000000). It then prints the
// PTree node sizes, the number of entries visible at the latest version, and the growth of the
// FastAllocator bucket used for the node size (difference of getTotalMemory() / 1e6, labelled MB).
void versionedMapTest() {
	VersionedMap<int,int> vm;

	printf("SS Ptree node is %zu bytes\n", sizeof( StorageServer::VersionedData::PTreeT ) );

	// Node size and the FastAllocator bucket it falls into (anything up to 64 bytes uses the 64 bucket).
	// Both must stay compile-time constants: allocBytes is a template argument below.
	const int nodeBytes = sizeof(VersionedMap<int,int>::PTreeT);
	const int allocBytes = nodeBytes <= 64 ? 64 : nextFastAllocatedSize(nodeBytes);

	const int versionCount = 1000;
	const int insertsPerVersion = 1000;

	auto memBefore = FastAllocator< allocBytes >::getTotalMemory();

	for (int version = 1; version <= versionCount; ++version) {
		vm.createNewVersion(version);
		for (int remaining = insertsPerVersion; remaining > 0; --remaining) {
			int key = deterministicRandom()->randomInt(0, 2000000);
			// Erase the range given as (key-5, key+5), then insert the key at this version.
			// NOTE(review): presumably replaces an older per-key find/erase loop over the same neighbourhood - confirm.
			vm.erase( key-5, key+5 );
			vm.insert( key, version );
		}
	}

	auto memAfter = FastAllocator< allocBytes >::getTotalMemory();

	// Count the entries visible at the latest version by walking the view.
	int distinct = 0;
	auto cursor = vm.atLatest().begin();
	while (cursor != vm.atLatest().end()) {
		++distinct;
		++cursor;
	}

	printf("PTree node is %d bytes, allocated as %d bytes\n", nodeBytes, allocBytes);
	printf("%d distinct after %d insertions\n", distinct, versionCount * insertsPerVersion);
	printf("Memory used: %f MB\n", (memAfter - memBefore) / 1e6);
}