foundationdb/fdbclient/SpecialKeySpace.actor.cpp

2699 lines
115 KiB
C++

/*
* SpecialKeySpace.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2020 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "boost/lexical_cast.hpp"
#include "boost/algorithm/string.hpp"
#include <time.h>
#include <msgpack.hpp>
#include <exception>
#include "fdbclient/ActorLineageProfiler.h"
#include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/ProcessInterface.h"
#include "fdbclient/GlobalConfig.actor.h"
#include "fdbclient/SpecialKeySpace.actor.h"
#include "flow/Arena.h"
#include "flow/UnitTest.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/StatusClient.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace {
const std::string kTracingTransactionIdKey = "transaction_id";
const std::string kTracingTokenKey = "token";
// Max version we can set for minRequiredCommitVersionKey,
// making sure the cluster can still be alive for 1000 years after the recovery
const Version maxAllowedVerion =
std::numeric_limits<int64_t>::max() - 1 - CLIENT_KNOBS->VERSIONS_PER_SECOND * 3600 * 24 * 365 * 1000;
static bool isAlphaNumeric(const std::string& key) {
// [A-Za-z0-9_]+
if (!key.size())
return false;
for (const char& c : key) {
if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'))
return false;
}
return true;
}
} // namespace
std::unordered_map<SpecialKeySpace::MODULE, KeyRange> SpecialKeySpace::moduleToBoundary = {
{ SpecialKeySpace::MODULE::TRANSACTION,
KeyRangeRef(LiteralStringRef("\xff\xff/transaction/"), LiteralStringRef("\xff\xff/transaction0")) },
{ SpecialKeySpace::MODULE::WORKERINTERFACE,
KeyRangeRef(LiteralStringRef("\xff\xff/worker_interfaces/"), LiteralStringRef("\xff\xff/worker_interfaces0")) },
{ SpecialKeySpace::MODULE::STATUSJSON, singleKeyRange(LiteralStringRef("\xff\xff/status/json")) },
{ SpecialKeySpace::MODULE::CONNECTIONSTRING, singleKeyRange(LiteralStringRef("\xff\xff/connection_string")) },
{ SpecialKeySpace::MODULE::CLUSTERFILEPATH, singleKeyRange(LiteralStringRef("\xff\xff/cluster_file_path")) },
{ SpecialKeySpace::MODULE::METRICS,
KeyRangeRef(LiteralStringRef("\xff\xff/metrics/"), LiteralStringRef("\xff\xff/metrics0")) },
{ SpecialKeySpace::MODULE::MANAGEMENT,
KeyRangeRef(LiteralStringRef("\xff\xff/management/"), LiteralStringRef("\xff\xff/management0")) },
{ SpecialKeySpace::MODULE::ERRORMSG, singleKeyRange(LiteralStringRef("\xff\xff/error_message")) },
{ SpecialKeySpace::MODULE::CONFIGURATION,
KeyRangeRef(LiteralStringRef("\xff\xff/configuration/"), LiteralStringRef("\xff\xff/configuration0")) },
{ SpecialKeySpace::MODULE::GLOBALCONFIG,
KeyRangeRef(LiteralStringRef("\xff\xff/global_config/"), LiteralStringRef("\xff\xff/global_config0")) },
{ SpecialKeySpace::MODULE::TRACING,
KeyRangeRef(LiteralStringRef("\xff\xff/tracing/"), LiteralStringRef("\xff\xff/tracing0")) },
{ SpecialKeySpace::MODULE::ACTORLINEAGE,
KeyRangeRef(LiteralStringRef("\xff\xff/actor_lineage/"), LiteralStringRef("\xff\xff/actor_lineage0")) },
{ SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF,
KeyRangeRef(LiteralStringRef("\xff\xff/actor_profiler_conf/"),
LiteralStringRef("\xff\xff/actor_profiler_conf0")) }
};
std::unordered_map<std::string, KeyRange> SpecialKeySpace::managementApiCommandToRange = {
{ "exclude",
KeyRangeRef(LiteralStringRef("excluded/"), LiteralStringRef("excluded0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "failed",
KeyRangeRef(LiteralStringRef("failed/"), LiteralStringRef("failed0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "excludedlocality",
KeyRangeRef(LiteralStringRef("excluded_locality/"), LiteralStringRef("excluded_locality0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "failedlocality",
KeyRangeRef(LiteralStringRef("failed_locality/"), LiteralStringRef("failed_locality0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "lock", singleKeyRange(LiteralStringRef("db_locked")).withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "consistencycheck",
singleKeyRange(LiteralStringRef("consistency_check_suspended"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "coordinators",
KeyRangeRef(LiteralStringRef("coordinators/"), LiteralStringRef("coordinators0"))
.withPrefix(moduleToBoundary[MODULE::CONFIGURATION].begin) },
{ "advanceversion",
singleKeyRange(LiteralStringRef("min_required_commit_version"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "profile",
KeyRangeRef(LiteralStringRef("profiling/"), LiteralStringRef("profiling0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "maintenance",
KeyRangeRef(LiteralStringRef("maintenance/"), LiteralStringRef("maintenance0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) },
{ "datadistribution",
KeyRangeRef(LiteralStringRef("data_distribution/"), LiteralStringRef("data_distribution0"))
.withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin) }
};
std::unordered_map<std::string, KeyRange> SpecialKeySpace::actorLineageApiCommandToRange = {
{ "state",
KeyRangeRef(LiteralStringRef("state/"), LiteralStringRef("state0"))
.withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) },
{ "time",
KeyRangeRef(LiteralStringRef("time/"), LiteralStringRef("time0"))
.withPrefix(moduleToBoundary[MODULE::ACTORLINEAGE].begin) }
};
std::set<std::string> SpecialKeySpace::options = { "excluded/force",
"failed/force",
"excluded_locality/force",
"failed_locality/force" };
std::set<std::string> SpecialKeySpace::tracingOptions = { kTracingTransactionIdKey, kTracingTokenKey };
RangeResult rywGetRange(ReadYourWritesTransaction* ryw, const KeyRangeRef& kr, const RangeResult& res);
// This function will move the given KeySelector as far as possible to the standard form:
// orEqual == false && offset == 1 (Standard form)
// If the corresponding key is not in the underlying key range, it will move over the range
// The cache object is used to cache the first read result from the rpc call during the key resolution,
// then when we need to do key resolution or result filtering,
// we, instead of rpc call, read from this cache object have consistent results
ACTOR Future<Void> moveKeySelectorOverRangeActor(const SpecialKeyRangeReadImpl* skrImpl,
ReadYourWritesTransaction* ryw,
KeySelector* ks,
Optional<RangeResult>* cache) {
// should be removed before calling
ASSERT(!ks->orEqual);
// never being called if KeySelector is already normalized
ASSERT(ks->offset != 1);
state Key startKey(skrImpl->getKeyRange().begin);
state Key endKey(skrImpl->getKeyRange().end);
state RangeResult result;
if (ks->offset < 1) {
// less than the given key
if (skrImpl->getKeyRange().contains(ks->getKey()))
endKey = ks->getKey();
} else {
// greater than the given key
if (skrImpl->getKeyRange().contains(ks->getKey()))
startKey = ks->getKey();
}
// Note : startKey never equals endKey here
ASSERT(startKey < endKey);
TraceEvent(SevDebug, "NormalizeKeySelector")
.detail("OriginalKey", ks->getKey())
.detail("OriginalOffset", ks->offset)
.detail("SpecialKeyRangeStart", skrImpl->getKeyRange().begin)
.detail("SpecialKeyRangeEnd", skrImpl->getKeyRange().end);
GetRangeLimits limitsHint(ks->offset >= 1 ? ks->offset : 1 - ks->offset);
if (skrImpl->isAsync()) {
const SpecialKeyRangeAsyncImpl* ptr = dynamic_cast<const SpecialKeyRangeAsyncImpl*>(skrImpl);
RangeResult result_ = wait(ptr->getRange(ryw, KeyRangeRef(startKey, endKey), limitsHint, cache));
result = result_;
} else {
RangeResult result_ = wait(skrImpl->getRange(ryw, KeyRangeRef(startKey, endKey), limitsHint));
result = result_;
}
if (result.size() == 0) {
TraceEvent(SevDebug, "ZeroElementsIntheRange").detail("Start", startKey).detail("End", endKey);
return Void();
}
// Note : KeySelector::setKey has byte limit according to the knobs, customize it if needed
if (ks->offset < 1) {
if (result.size() >= 1 - ks->offset) {
ks->setKey(KeyRef(ks->arena(), result[result.size() - (1 - ks->offset)].key));
ks->offset = 1;
} else {
ks->setKey(KeyRef(ks->arena(), result[0].key));
ks->offset += result.size();
}
} else {
if (result.size() >= ks->offset) {
ks->setKey(KeyRef(ks->arena(), result[ks->offset - 1].key));
ks->offset = 1;
} else {
// TODO : the keyAfter will just return if key == \xff\xff
ks->setKey(KeyRef(ks->arena(), keyAfter(result[result.size() - 1].key)));
ks->offset -= result.size();
}
}
TraceEvent(SevDebug, "NormalizeKeySelector")
.detail("NormalizedKey", ks->getKey())
.detail("NormalizedOffset", ks->offset)
.detail("SpecialKeyRangeStart", skrImpl->getKeyRange().begin)
.detail("SpecialKeyRangeEnd", skrImpl->getKeyRange().end);
return Void();
}
// This function will normalize the given KeySelector to a standard KeySelector:
// orEqual == false && offset == 1 (Standard form)
// If the corresponding key is outside the whole space, it will move to the begin or the end
// It does have overhead here since we query all keys twice in the worst case.
// However, moving the KeySelector while handling other parameters like limits makes the code much more complex and hard
// to maintain; Thus, separate each part to make the code easy to understand and more compact
// Boundary is the range of the legal key space, which, by default is the range of the module
// And (\xff\xff, \xff\xff\xff) if SPECIAL_KEY_SPACE_RELAXED is turned on
ACTOR Future<Void> normalizeKeySelectorActor(SpecialKeySpace* sks,
ReadYourWritesTransaction* ryw,
KeySelector* ks,
KeyRangeRef boundary,
int* actualOffset,
RangeResult* result,
Optional<RangeResult>* cache) {
// If offset < 1, where we need to move left, iter points to the range containing at least one smaller key
// (It's a wasting of time to walk through the range whose begin key is same as ks->key)
// (rangeContainingKeyBefore itself handles the case where ks->key == Key())
// Otherwise, we only need to move right if offset > 1, iter points to the range containing the key
// Since boundary.end is always a key in the RangeMap, it is always safe to move right
state RangeMap<Key, SpecialKeyRangeReadImpl*, KeyRangeRef>::iterator iter =
ks->offset < 1 ? sks->getReadImpls().rangeContainingKeyBefore(ks->getKey())
: sks->getReadImpls().rangeContaining(ks->getKey());
while ((ks->offset < 1 && iter->begin() >= boundary.begin) || (ks->offset > 1 && iter->begin() < boundary.end)) {
if (iter->value() != nullptr) {
wait(moveKeySelectorOverRangeActor(iter->value(), ryw, ks, cache));
}
// Check if we can still move the iterator left
if (ks->offset < 1) {
if (iter == sks->getReadImpls().ranges().begin()) {
break;
} else {
--iter;
}
} else if (ks->offset > 1) {
// Always safe to move right
++iter;
}
}
*actualOffset = ks->offset;
if (!ks->isFirstGreaterOrEqual()) {
TraceEvent(SevDebug, "ReadToBoundary")
.detail("TerminateKey", ks->getKey())
.detail("TerminateOffset", ks->offset);
// If still not normalized after moving to the boundary,
// let key selector clamp up to the boundary
if (ks->offset < 1) {
result->readToBegin = true;
ks->setKey(boundary.begin);
} else {
result->readThroughEnd = true;
ks->setKey(boundary.end);
}
ks->offset = 1;
}
return Void();
}
SpecialKeySpace::SpecialKeySpace(KeyRef spaceStartKey, KeyRef spaceEndKey, bool testOnly)
: readImpls(nullptr, spaceEndKey),
modules(testOnly ? SpecialKeySpace::MODULE::TESTONLY : SpecialKeySpace::MODULE::UNKNOWN, spaceEndKey),
writeImpls(nullptr, spaceEndKey), range(KeyRangeRef(spaceStartKey, spaceEndKey)) {
// Default begin of KeyRangeMap is Key(), insert the range to update start key
readImpls.insert(range, nullptr);
writeImpls.insert(range, nullptr);
if (!testOnly) {
// testOnly is used in the correctness workload
modulesBoundaryInit();
}
}
void SpecialKeySpace::modulesBoundaryInit() {
for (const auto& pair : moduleToBoundary) {
ASSERT(range.contains(pair.second));
// Make sure the module is not overlapping with any registered read modules
// Note: same like ranges, one module's end cannot be another module's start, relax the condition if needed
ASSERT(modules.rangeContaining(pair.second.begin) == modules.rangeContaining(pair.second.end) &&
modules[pair.second.begin] == SpecialKeySpace::MODULE::UNKNOWN);
modules.insert(pair.second, pair.first);
// Note: Due to underlying implementation, the insertion here is important to make cross_module_read being
// handled correctly
readImpls.insert(pair.second, nullptr);
writeImpls.insert(pair.second, nullptr);
}
}
ACTOR Future<RangeResult> SpecialKeySpace::checkRYWValid(SpecialKeySpace* sks,
ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Reverse reverse) {
ASSERT(ryw);
choose {
when(RangeResult result =
wait(SpecialKeySpace::getRangeAggregationActor(sks, ryw, begin, end, limits, reverse))) {
return result;
}
when(wait(ryw->resetFuture())) { throw internal_error(); }
}
}
ACTOR Future<RangeResult> SpecialKeySpace::getRangeAggregationActor(SpecialKeySpace* sks,
ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Reverse reverse) {
// This function handles ranges which cover more than one keyrange and aggregates all results
// KeySelector, GetRangeLimits and reverse are all handled here
state RangeResult result;
state RangeResult pairs;
state RangeMap<Key, SpecialKeyRangeReadImpl*, KeyRangeRef>::iterator iter;
state int actualBeginOffset;
state int actualEndOffset;
state KeyRangeRef moduleBoundary;
// used to cache result from potential first read
state Optional<RangeResult> cache;
if (ryw->specialKeySpaceRelaxed()) {
moduleBoundary = sks->range;
} else {
auto beginIter = sks->getModules().rangeContaining(begin.getKey());
if (beginIter->begin() <= end.getKey() && end.getKey() <= beginIter->end()) {
if (beginIter->value() == SpecialKeySpace::MODULE::UNKNOWN)
throw special_keys_no_module_found();
else
moduleBoundary = beginIter->range();
} else {
TraceEvent(SevInfo, "SpecialKeyCrossModuleRead")
.detail("Begin", begin)
.detail("End", end)
.detail("BoundaryBegin", beginIter->begin())
.detail("BoundaryEnd", beginIter->end());
throw special_keys_cross_module_read();
}
}
wait(normalizeKeySelectorActor(sks, ryw, &begin, moduleBoundary, &actualBeginOffset, &result, &cache));
wait(normalizeKeySelectorActor(sks, ryw, &end, moduleBoundary, &actualEndOffset, &result, &cache));
// Handle all corner cases like what RYW does
// return if range inverted
if (actualBeginOffset >= actualEndOffset && begin.getKey() >= end.getKey()) {
TEST(true); // inverted range
return RangeResultRef(false, false);
}
// If touches begin or end, return with readToBegin and readThroughEnd flags
if (begin.getKey() == moduleBoundary.end || end.getKey() == moduleBoundary.begin) {
TEST(true); // query touches begin or end
return result;
}
state RangeMap<Key, SpecialKeyRangeReadImpl*, KeyRangeRef>::Ranges ranges =
sks->getReadImpls().intersectingRanges(KeyRangeRef(begin.getKey(), end.getKey()));
// TODO : workaround to write this two together to make the code compact
// The issue here is boost::iterator_range<> doest not provide rbegin(), rend()
iter = reverse ? ranges.end() : ranges.begin();
if (reverse) {
while (iter != ranges.begin()) {
--iter;
if (iter->value() == nullptr)
continue;
KeyRangeRef kr = iter->range();
KeyRef keyStart = kr.contains(begin.getKey()) ? begin.getKey() : kr.begin;
KeyRef keyEnd = kr.contains(end.getKey()) ? end.getKey() : kr.end;
if (iter->value()->isAsync() && cache.present()) {
const SpecialKeyRangeAsyncImpl* ptr = dynamic_cast<const SpecialKeyRangeAsyncImpl*>(iter->value());
RangeResult pairs_ = wait(ptr->getRange(ryw, KeyRangeRef(keyStart, keyEnd), limits, &cache));
pairs = pairs_;
} else {
RangeResult pairs_ = wait(iter->value()->getRange(ryw, KeyRangeRef(keyStart, keyEnd), limits));
pairs = pairs_;
}
result.arena().dependsOn(pairs.arena());
// limits handler
for (int i = pairs.size() - 1; i >= 0; --i) {
ASSERT(iter->range().contains(pairs[i].key));
result.push_back(result.arena(), pairs[i]);
// Note : behavior here is even the last k-v pair makes total bytes larger than specified, it's still
// returned. In other words, the total size of the returned value (less the last entry) will be less
// than byteLimit
limits.decrement(pairs[i]);
if (limits.isReached()) {
result.more = true;
result.readToBegin = false;
return result;
};
}
}
} else {
for (iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (iter->value() == nullptr)
continue;
KeyRangeRef kr = iter->range();
KeyRef keyStart = kr.contains(begin.getKey()) ? begin.getKey() : kr.begin;
KeyRef keyEnd = kr.contains(end.getKey()) ? end.getKey() : kr.end;
if (iter->value()->isAsync() && cache.present()) {
const SpecialKeyRangeAsyncImpl* ptr = dynamic_cast<const SpecialKeyRangeAsyncImpl*>(iter->value());
RangeResult pairs_ = wait(ptr->getRange(ryw, KeyRangeRef(keyStart, keyEnd), limits, &cache));
pairs = pairs_;
} else {
RangeResult pairs_ = wait(iter->value()->getRange(ryw, KeyRangeRef(keyStart, keyEnd), limits));
pairs = pairs_;
}
result.arena().dependsOn(pairs.arena());
// limits handler
for (int i = 0; i < pairs.size(); ++i) {
ASSERT(iter->range().contains(pairs[i].key));
result.push_back(result.arena(), pairs[i]);
// Note : behavior here is even the last k-v pair makes total bytes larger than specified, it's still
// returned. In other words, the total size of the returned value (less the last entry) will be less
// than byteLimit
limits.decrement(pairs[i]);
if (limits.isReached()) {
result.more = true;
result.readThroughEnd = false;
return result;
};
}
}
}
return result;
}
Future<RangeResult> SpecialKeySpace::getRange(ReadYourWritesTransaction* ryw,
KeySelector begin,
KeySelector end,
GetRangeLimits limits,
Reverse reverse) {
// validate limits here
if (!limits.isValid())
return range_limits_invalid();
if (limits.isReached()) {
TEST(true); // read limit 0
return RangeResult();
}
// make sure orEqual == false
begin.removeOrEqual(begin.arena());
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
TEST(true); // range inverted
return RangeResult();
}
return checkRYWValid(this, ryw, begin, end, limits, reverse);
}
ACTOR Future<Optional<Value>> SpecialKeySpace::getActor(SpecialKeySpace* sks,
ReadYourWritesTransaction* ryw,
KeyRef key) {
// use getRange to workaround this
RangeResult result = wait(sks->getRange(ryw,
KeySelector(firstGreaterOrEqual(key)),
KeySelector(firstGreaterOrEqual(keyAfter(key))),
GetRangeLimits(CLIENT_KNOBS->TOO_MANY),
Reverse::False));
ASSERT(result.size() <= 1);
if (result.size()) {
return Optional<Value>(result[0].value);
} else {
return Optional<Value>();
}
}
Future<Optional<Value>> SpecialKeySpace::get(ReadYourWritesTransaction* ryw, const Key& key) {
return getActor(this, ryw, key);
}
void SpecialKeySpace::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
if (!ryw->specialKeySpaceChangeConfiguration())
throw special_keys_write_disabled();
auto impl = writeImpls[key];
if (impl == nullptr) {
TraceEvent(SevDebug, "SpecialKeySpaceNoWriteModuleFound")
.detail("Key", key.toString())
.detail("Value", value.toString());
throw special_keys_no_write_module_found();
}
return impl->set(ryw, key, value);
}
void SpecialKeySpace::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
if (!ryw->specialKeySpaceChangeConfiguration())
throw special_keys_write_disabled();
if (range.empty())
return;
auto begin = writeImpls[range.begin];
auto end = writeImpls.rangeContainingKeyBefore(range.end)->value();
if (begin != end) {
TraceEvent(SevDebug, "SpecialKeySpaceCrossModuleClear").detail("Range", range);
throw special_keys_cross_module_clear(); // ban cross module clear
} else if (begin == nullptr) {
TraceEvent(SevDebug, "SpecialKeySpaceNoWriteModuleFound").detail("Range", range);
throw special_keys_no_write_module_found();
}
return begin->clear(ryw, range);
}
void SpecialKeySpace::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
if (!ryw->specialKeySpaceChangeConfiguration())
throw special_keys_write_disabled();
auto impl = writeImpls[key];
if (impl == nullptr)
throw special_keys_no_write_module_found();
return impl->clear(ryw, key);
}
bool validateSnakeCaseNaming(const KeyRef& k) {
KeyRef key(k);
// Remove prefix \xff\xff
ASSERT(key.startsWith(specialKeys.begin));
key = key.removePrefix(specialKeys.begin);
// Suffix can be \xff\xff or \x00 in single key range
if (key.endsWith(specialKeys.begin))
key = key.removeSuffix(specialKeys.end);
else if (key.endsWith(LiteralStringRef("\x00")))
key = key.removeSuffix(LiteralStringRef("\x00"));
for (const char& c : key.toString()) {
// only small letters, numbers, '/', '_' is allowed
ASSERT((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '/' || c == '_');
}
return true;
}
void SpecialKeySpace::registerKeyRange(SpecialKeySpace::MODULE module,
SpecialKeySpace::IMPLTYPE type,
const KeyRangeRef& kr,
SpecialKeyRangeReadImpl* impl) {
// module boundary check
if (module == SpecialKeySpace::MODULE::TESTONLY) {
ASSERT(normalKeys.contains(kr));
} else {
ASSERT(moduleToBoundary.at(module).contains(kr));
// validate keys follow snake case naming style
ASSERT(validateSnakeCaseNaming(kr.begin) && validateSnakeCaseNaming(kr.end));
}
// make sure the registered range is not overlapping with existing ones
// Note: kr.end should not be the same as another range's begin, although it should work even they are the same
for (auto iter = readImpls.rangeContaining(kr.begin); true; ++iter) {
ASSERT(iter->value() == nullptr);
if (iter == readImpls.rangeContaining(kr.end)) {
// Note: relax the condition that the end can be another range's start, if needed
break;
}
}
readImpls.insert(kr, impl);
// if rw, it means the module can do both read and write
if (type == SpecialKeySpace::IMPLTYPE::READWRITE) {
// since write impls are always subset of read impls,
// no need to check overlapped registration
auto rwImpl = dynamic_cast<SpecialKeyRangeRWImpl*>(impl);
ASSERT(rwImpl);
writeImpls.insert(kr, rwImpl);
}
}
Key SpecialKeySpace::decode(const KeyRef& key) {
auto impl = writeImpls[key];
ASSERT(impl != nullptr);
return impl->decode(key);
}
KeyRange SpecialKeySpace::decode(const KeyRangeRef& kr) {
// Only allow to decode key range in the same underlying impl range
auto begin = writeImpls.rangeContaining(kr.begin);
ASSERT(begin->value() != nullptr);
auto end = writeImpls.rangeContainingKeyBefore(kr.end);
ASSERT(begin == end);
return KeyRangeRef(begin->value()->decode(kr.begin), begin->value()->decode(kr.end));
}
ACTOR Future<Void> commitActor(SpecialKeySpace* sks, ReadYourWritesTransaction* ryw) {
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::Ranges ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(specialKeys);
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::iterator iter = ranges.begin();
state std::vector<SpecialKeyRangeRWImpl*> writeModulePtrs;
std::unordered_set<SpecialKeyRangeRWImpl*> deduplicate;
while (iter != ranges.end()) {
std::pair<bool, Optional<Value>> entry = iter->value();
if (entry.first) {
auto modulePtr = sks->getRWImpls().rangeContaining(iter->begin())->value();
auto [_, inserted] = deduplicate.insert(modulePtr);
if (inserted) {
writeModulePtrs.push_back(modulePtr);
}
}
++iter;
}
state std::vector<SpecialKeyRangeRWImpl*>::const_iterator it;
for (it = writeModulePtrs.begin(); it != writeModulePtrs.end(); ++it) {
Optional<std::string> msg = wait((*it)->commit(ryw));
if (msg.present()) {
ryw->setSpecialKeySpaceErrorMsg(msg.get());
TraceEvent(SevDebug, "SpecialKeySpaceManagementAPIError")
.detail("Reason", msg.get())
.detail("Range", (*it)->getKeyRange().toString());
throw special_keys_api_failure();
}
}
return Void();
}
Future<Void> SpecialKeySpace::commit(ReadYourWritesTransaction* ryw) {
return commitActor(this, ryw);
}
SKSCTestImpl::SKSCTestImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> SKSCTestImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
ASSERT(range.contains(kr));
auto resultFuture = ryw->getRange(kr, CLIENT_KNOBS->TOO_MANY);
// all keys are written to RYW, since GRV is set, the read should happen locally
ASSERT(resultFuture.isReady());
auto result = resultFuture.getValue();
ASSERT(!result.more && result.size() < CLIENT_KNOBS->TOO_MANY);
auto kvs = resultFuture.getValue();
return rywGetRange(ryw, kr, kvs);
}
Future<Optional<std::string>> SKSCTestImpl::commit(ReadYourWritesTransaction* ryw) {
ASSERT(false);
return Optional<std::string>();
}
ReadConflictRangeImpl::ReadConflictRangeImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
ACTOR static Future<RangeResult> getReadConflictRangeImpl(ReadYourWritesTransaction* ryw, KeyRange kr) {
wait(ryw->pendingReads());
return ryw->getReadConflictRangeIntersecting(kr);
}
Future<RangeResult> ReadConflictRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return getReadConflictRangeImpl(ryw, kr);
}
WriteConflictRangeImpl::WriteConflictRangeImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
Future<RangeResult> WriteConflictRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return ryw->getWriteConflictRangeIntersecting(kr);
}
ConflictingKeysImpl::ConflictingKeysImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
Future<RangeResult> ConflictingKeysImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
RangeResult result;
if (ryw->getTransactionState()->conflictingKeys) {
auto krMapPtr = ryw->getTransactionState()->conflictingKeys.get();
auto beginIter = krMapPtr->rangeContaining(kr.begin);
if (beginIter->begin() != kr.begin)
++beginIter;
auto endIter = krMapPtr->rangeContaining(kr.end);
for (auto it = beginIter; it != endIter; ++it) {
result.push_back_deep(result.arena(), KeyValueRef(it->begin(), it->value()));
}
if (endIter->begin() != kr.end)
result.push_back_deep(result.arena(), KeyValueRef(endIter->begin(), endIter->value()));
}
return result;
}
ACTOR Future<RangeResult> ddMetricsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
loop {
try {
auto keys = kr.removePrefix(ddStatsRange.begin);
Standalone<VectorRef<DDMetricsRef>> resultWithoutPrefix = wait(
waitDataDistributionMetricsList(ryw->getDatabase(), keys, CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT));
RangeResult result;
for (const auto& ddMetricsRef : resultWithoutPrefix) {
// each begin key is the previous end key, thus we only encode the begin key in the result
KeyRef beginKey = ddMetricsRef.beginKey.withPrefix(ddStatsRange.begin, result.arena());
// Use json string encoded in utf-8 to encode the values, easy for adding more fields in the future
json_spirit::mObject statsObj;
statsObj["shard_bytes"] = ddMetricsRef.shardBytes;
std::string statsString =
json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8);
ValueRef bytes(result.arena(), statsString);
result.push_back(result.arena(), KeyValueRef(beginKey, bytes));
}
return result;
} catch (Error& e) {
state Error err(e);
if (e.code() == error_code_dd_not_found) {
TraceEvent(SevWarnAlways, "DataDistributorNotPresent")
.detail("Operation", "DDMetricsReqestThroughSpecialKeys");
wait(delayJittered(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
continue;
}
throw err;
}
}
}
DDStatsRangeImpl::DDStatsRangeImpl(KeyRangeRef kr) : SpecialKeyRangeAsyncImpl(kr) {}
Future<RangeResult> DDStatsRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return ddMetricsGetRangeActor(ryw, kr);
}
Key SpecialKeySpace::getManagementApiCommandOptionSpecialKey(const std::string& command, const std::string& option) {
Key prefix = LiteralStringRef("options/").withPrefix(moduleToBoundary[MODULE::MANAGEMENT].begin);
auto pair = command + "/" + option;
ASSERT(options.find(pair) != options.end());
return prefix.withSuffix(pair);
}
ManagementCommandsOptionsImpl::ManagementCommandsOptionsImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> ManagementCommandsOptionsImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
RangeResult result;
// Since we only have limit number of options, a brute force loop here is enough
for (const auto& option : SpecialKeySpace::getManagementApiOptionsSet()) {
auto key = getKeyRange().begin.withSuffix(option);
// ignore all invalid keys
auto r = ryw->getSpecialKeySpaceWriteMap()[key];
if (kr.contains(key) && r.first && r.second.present()) {
result.push_back(result.arena(), KeyValueRef(key, ValueRef()));
result.arena().dependsOn(key.arena());
}
}
return result;
}
void ManagementCommandsOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
std::string option = key.removePrefix(getKeyRange().begin).toString();
// ignore all invalid keys
if (SpecialKeySpace::getManagementApiOptionsSet().find(option) !=
SpecialKeySpace::getManagementApiOptionsSet().end()) {
TraceEvent(SevDebug, "ManagementApiOption").detail("Option", option).detail("Key", key);
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(value)));
}
}
void ManagementCommandsOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
ryw->getSpecialKeySpaceWriteMap().rawErase(range);
}
void ManagementCommandsOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
std::string option = key.removePrefix(getKeyRange().begin).toString();
// ignore all invalid keys
if (SpecialKeySpace::getManagementApiOptionsSet().find(option) !=
SpecialKeySpace::getManagementApiOptionsSet().end()) {
ryw->getSpecialKeySpaceWriteMap().rawErase(singleKeyRange(key));
}
}
Future<Optional<std::string>> ManagementCommandsOptionsImpl::commit(ReadYourWritesTransaction* ryw) {
// Nothing to do, keys should be used by other impls' commit callback
return Optional<std::string>();
}
RangeResult rywGetRange(ReadYourWritesTransaction* ryw, const KeyRangeRef& kr, const RangeResult& res) {
// "res" is the read result regardless of your writes, if ryw disabled, return immediately
if (ryw->readYourWritesDisabled())
return res;
// If ryw enabled, we update it with writes from the transaction
RangeResult result;
RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::Ranges ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(kr);
RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::iterator iter = ranges.begin();
auto iter2 = res.begin();
result.arena().dependsOn(res.arena());
while (iter != ranges.end() || iter2 != res.end()) {
if (iter == ranges.end()) {
result.push_back(result.arena(), KeyValueRef(iter2->key, iter2->value));
++iter2;
} else if (iter2 == res.end()) {
// insert if it is a set entry
std::pair<bool, Optional<Value>> entry = iter->value();
if (entry.first && entry.second.present()) {
result.push_back_deep(result.arena(), KeyValueRef(iter->begin(), entry.second.get()));
}
++iter;
} else if (iter->range().contains(iter2->key)) {
std::pair<bool, Optional<Value>> entry = iter->value();
// if this is a valid range either for set or clear, move iter2 outside the range
if (entry.first) {
// insert if this is a set entry
if (entry.second.present())
result.push_back_deep(result.arena(), KeyValueRef(iter->begin(), entry.second.get()));
// move iter2 outside the range
while (iter2 != res.end() && iter->range().contains(iter2->key))
++iter2;
}
++iter;
} else if (iter->begin() > iter2->key) {
result.push_back(result.arena(), KeyValueRef(iter2->key, iter2->value));
++iter2;
} else if (iter->end() <= iter2->key) {
// insert if it is a set entry
std::pair<bool, Optional<Value>> entry = iter->value();
if (entry.first && entry.second.present()) {
result.push_back_deep(result.arena(), KeyValueRef(iter->begin(), entry.second.get()));
}
++iter;
}
}
return result;
}
// read from those readwrite modules in which special keys have one-to-one mapping with real persisted keys
ACTOR Future<RangeResult> rwModuleWithMappingGetRangeActor(ReadYourWritesTransaction* ryw,
const SpecialKeyRangeRWImpl* impl,
KeyRangeRef kr) {
RangeResult resultWithoutPrefix =
wait(ryw->getTransaction().getRange(ryw->getDatabase()->specialKeySpace->decode(kr), CLIENT_KNOBS->TOO_MANY));
ASSERT(!resultWithoutPrefix.more && resultWithoutPrefix.size() < CLIENT_KNOBS->TOO_MANY);
RangeResult result;
for (const KeyValueRef& kv : resultWithoutPrefix)
result.push_back_deep(result.arena(), KeyValueRef(impl->encode(kv.key), kv.value));
return rywGetRange(ryw, kr, result);
}
ExcludeServersRangeImpl::ExcludeServersRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> ExcludeServersRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}
void ExcludeServersRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
// ignore value
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(ValueRef())));
}
Key ExcludeServersRangeImpl::decode(const KeyRef& key) const {
return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.withPrefix(LiteralStringRef("\xff/conf/"));
}
Key ExcludeServersRangeImpl::encode(const KeyRef& key) const {
return key.removePrefix(LiteralStringRef("\xff/conf/"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
}
bool parseNetWorkAddrFromKeys(ReadYourWritesTransaction* ryw,
bool failed,
std::vector<AddressExclusion>& addresses,
std::set<AddressExclusion>& exclusions,
Optional<std::string>& msg) {
KeyRangeRef range = failed ? SpecialKeySpace::getManagementApiCommandRange("failed")
: SpecialKeySpace::getManagementApiCommandRange("exclude");
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range);
auto iter = ranges.begin();
while (iter != ranges.end()) {
auto entry = iter->value();
// only check for exclude(set) operation, include(clear) are not checked
TraceEvent(SevDebug, "ParseNetworkAddress")
.detail("Valid", entry.first)
.detail("Set", entry.second.present())
.detail("Key", iter->begin().toString());
if (entry.first && entry.second.present()) {
Key address = iter->begin().removePrefix(range.begin);
auto a = AddressExclusion::parse(address);
if (!a.isValid()) {
std::string error = "ERROR: \'" + address.toString() + "\' is not a valid network endpoint address\n";
if (address.toString().find(":tls") != std::string::npos)
error += " Do not include the `:tls' suffix when naming a process\n";
msg = ManagementAPIError::toJsonString(
false, entry.second.present() ? (failed ? "exclude failed" : "exclude") : "include", error);
return false;
}
addresses.push_back(a);
exclusions.insert(a);
}
++iter;
}
return true;
}
ACTOR Future<bool> checkExclusion(Database db,
std::vector<AddressExclusion>* addresses,
std::set<AddressExclusion>* exclusions,
bool markFailed,
Optional<std::string>* msg) {
if (markFailed) {
state bool safe;
try {
bool _safe = wait(checkSafeExclusions(db, *addresses));
safe = _safe;
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
TraceEvent("CheckSafeExclusionsError").error(e);
safe = false;
}
if (!safe) {
std::string temp = "ERROR: It is unsafe to exclude the specified servers at this time.\n"
"Please check that this exclusion does not bring down an entire storage team.\n"
"Please also ensure that the exclusion will keep a majority of coordinators alive.\n"
"You may add more storage processes or coordinators to make the operation safe.\n"
"Call set(\"0xff0xff/management/failed/<ADDRESS...>\", ...) to exclude without "
"performing safety checks.\n";
*msg = ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", temp);
return false;
}
}
StatusObject status = wait(StatusClient::statusFetcher(db));
state std::string errorString =
"ERROR: Could not calculate the impact of this exclude on the total free space in the cluster.\n"
"Please try the exclude again in 30 seconds.\n"
"Call set(\"0xff0xff/management/options/exclude/force\", ...) first to exclude without checking free "
"space.\n";
StatusObjectReader statusObj(status);
StatusObjectReader statusObjCluster;
if (!statusObj.get("cluster", statusObjCluster)) {
*msg = ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString);
return false;
}
StatusObjectReader processesMap;
if (!statusObjCluster.get("processes", processesMap)) {
*msg = ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString);
return false;
}
state int ssTotalCount = 0;
state int ssExcludedCount = 0;
state double worstFreeSpaceRatio = 1.0;
try {
for (auto proc : processesMap.obj()) {
bool storageServer = false;
StatusArray rolesArray = proc.second.get_obj()["roles"].get_array();
for (StatusObjectReader role : rolesArray) {
if (role["role"].get_str() == "storage") {
storageServer = true;
break;
}
}
// Skip non-storage servers in free space calculation
if (!storageServer)
continue;
StatusObjectReader process(proc.second);
std::string addrStr;
if (!process.get("address", addrStr)) {
*msg = ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString);
return false;
}
NetworkAddress addr = NetworkAddress::parse(addrStr);
bool excluded =
(process.has("excluded") && process.last().get_bool()) || addressExcluded(*exclusions, addr);
ssTotalCount++;
if (excluded)
ssExcludedCount++;
if (!excluded) {
StatusObjectReader disk;
if (!process.get("disk", disk)) {
*msg =
ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString);
return false;
}
int64_t total_bytes;
if (!disk.get("total_bytes", total_bytes)) {
*msg =
ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString);
return false;
}
int64_t free_bytes;
if (!disk.get("free_bytes", free_bytes)) {
*msg =
ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString);
return false;
}
worstFreeSpaceRatio = std::min(worstFreeSpaceRatio, double(free_bytes) / total_bytes);
}
}
} catch (...) // std::exception
{
*msg = ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", errorString);
return false;
}
if (ssExcludedCount == ssTotalCount ||
(1 - worstFreeSpaceRatio) * ssTotalCount / (ssTotalCount - ssExcludedCount) > 0.9) {
std::string temp = "ERROR: This exclude may cause the total free space in the cluster to drop below 10%.\n"
"Call set(\"0xff0xff/management/options/exclude/force\", ...) first to exclude without "
"checking free space.\n";
*msg = ManagementAPIError::toJsonString(false, markFailed ? "exclude failed" : "exclude", temp);
return false;
}
return true;
}
void includeServers(ReadYourWritesTransaction* ryw) {
ryw->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
// includeServers might be used in an emergency transaction, so make sure it is retry-self-conflicting and
// CAUSAL_WRITE_RISKY
ryw->setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY);
std::string versionKey = deterministicRandom()->randomUniqueID().toString();
// for exluded servers
auto ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(SpecialKeySpace::getManagementApiCommandRange("exclude"));
auto iter = ranges.begin();
Transaction& tr = ryw->getTransaction();
while (iter != ranges.end()) {
auto entry = iter->value();
if (entry.first && !entry.second.present()) {
tr.addReadConflictRange(singleKeyRange(excludedServersVersionKey));
tr.set(excludedServersVersionKey, versionKey);
tr.clear(ryw->getDatabase()->specialKeySpace->decode(iter->range()));
}
++iter;
}
// for failed servers
ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(SpecialKeySpace::getManagementApiCommandRange("failed"));
iter = ranges.begin();
while (iter != ranges.end()) {
auto entry = iter->value();
if (entry.first && !entry.second.present()) {
tr.addReadConflictRange(singleKeyRange(failedServersVersionKey));
tr.set(failedServersVersionKey, versionKey);
tr.clear(ryw->getDatabase()->specialKeySpace->decode(iter->range()));
}
++iter;
}
}
ACTOR Future<Optional<std::string>> excludeCommitActor(ReadYourWritesTransaction* ryw, bool failed) {
// parse network addresses
state Optional<std::string> result;
state std::vector<AddressExclusion> addresses;
state std::set<AddressExclusion> exclusions;
if (!parseNetWorkAddrFromKeys(ryw, failed, addresses, exclusions, result))
return result;
// If force option is not set, we need to do safety check
auto force = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandOptionSpecialKey(
failed ? "failed" : "excluded", "force")];
// only do safety check when we have servers to be excluded and the force option key is not set
if (addresses.size() && !(force.first && force.second.present())) {
bool safe = wait(checkExclusion(ryw->getDatabase(), &addresses, &exclusions, failed, &result));
if (!safe)
return result;
}
excludeServers(ryw->getTransaction(), addresses, failed);
includeServers(ryw);
return result;
}
Future<Optional<std::string>> ExcludeServersRangeImpl::commit(ReadYourWritesTransaction* ryw) {
return excludeCommitActor(ryw, false);
}
FailedServersRangeImpl::FailedServersRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> FailedServersRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}
void FailedServersRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
// ignore value
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(ValueRef())));
}
Key FailedServersRangeImpl::decode(const KeyRef& key) const {
return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.withPrefix(LiteralStringRef("\xff/conf/"));
}
Key FailedServersRangeImpl::encode(const KeyRef& key) const {
return key.removePrefix(LiteralStringRef("\xff/conf/"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
}
Future<Optional<std::string>> FailedServersRangeImpl::commit(ReadYourWritesTransaction* ryw) {
return excludeCommitActor(ryw, true);
}
ACTOR Future<RangeResult> ExclusionInProgressActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) {
state RangeResult result;
state Transaction& tr = ryw->getTransaction();
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); // necessary?
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state std::vector<AddressExclusion> excl = wait((getExcludedServers(&tr)));
state std::set<AddressExclusion> exclusions(excl.begin(), excl.end());
state std::set<NetworkAddress> inProgressExclusion;
// Just getting a consistent read version proves that a set of tlogs satisfying the exclusions has completed
// recovery Check that there aren't any storage servers with addresses violating the exclusions
state RangeResult serverList = wait(tr.getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY));
ASSERT(!serverList.more && serverList.size() < CLIENT_KNOBS->TOO_MANY);
for (auto& s : serverList) {
auto addresses = decodeServerListValue(s.value).getKeyValues.getEndpoint().addresses;
if (addressExcluded(exclusions, addresses.address)) {
inProgressExclusion.insert(addresses.address);
}
if (addresses.secondaryAddress.present() && addressExcluded(exclusions, addresses.secondaryAddress.get())) {
inProgressExclusion.insert(addresses.secondaryAddress.get());
}
}
Optional<Standalone<StringRef>> value = wait(tr.get(logsKey));
ASSERT(value.present());
auto logs = decodeLogsValue(value.get());
for (auto const& log : logs.first) {
if (log.second == NetworkAddress() || addressExcluded(exclusions, log.second)) {
inProgressExclusion.insert(log.second);
}
}
for (auto const& log : logs.second) {
if (log.second == NetworkAddress() || addressExcluded(exclusions, log.second)) {
inProgressExclusion.insert(log.second);
}
}
// sort and remove :tls
std::set<std::string> inProgressAddresses;
for (auto const& address : inProgressExclusion) {
inProgressAddresses.insert(formatIpPort(address.ip, address.port));
}
for (auto const& address : inProgressAddresses) {
Key addrKey = prefix.withSuffix(address);
if (kr.contains(addrKey)) {
result.push_back(result.arena(), KeyValueRef(addrKey, ValueRef()));
result.arena().dependsOn(addrKey.arena());
}
}
return result;
}
ExclusionInProgressRangeImpl::ExclusionInProgressRangeImpl(KeyRangeRef kr) : SpecialKeyRangeAsyncImpl(kr) {}
Future<RangeResult> ExclusionInProgressRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return ExclusionInProgressActor(ryw, getKeyRange().begin, kr);
}
ACTOR Future<RangeResult> getProcessClassActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
std::vector<ProcessData> _workers = wait(getWorkers(&ryw->getTransaction()));
auto workers = _workers; // strip const
// Note : the sort by string is anti intuition, ex. 1.1.1.1:11 < 1.1.1.1:5
std::sort(workers.begin(), workers.end(), [](const ProcessData& lhs, const ProcessData& rhs) {
return formatIpPort(lhs.address.ip, lhs.address.port) < formatIpPort(rhs.address.ip, rhs.address.port);
});
RangeResult result;
for (auto& w : workers) {
// exclude :tls in keys even the network addresss is TLS
KeyRef k(prefix.withSuffix(formatIpPort(w.address.ip, w.address.port), result.arena()));
if (kr.contains(k)) {
ValueRef v(result.arena(), w.processClass.toString());
result.push_back(result.arena(), KeyValueRef(k, v));
}
}
return rywGetRange(ryw, kr, result);
}
ACTOR Future<Optional<std::string>> processClassCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef range) {
// enable related options
ryw->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
std::vector<ProcessData> workers = wait(
getWorkers(&ryw->getTransaction())); // make sure we use the Transaction object to avoid used_during_commit()
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range);
auto iter = ranges.begin();
while (iter != ranges.end()) {
auto entry = iter->value();
// only loop through (set) operation, (clear) not exist
if (entry.first && entry.second.present()) {
// parse network address
Key address = iter->begin().removePrefix(range.begin);
AddressExclusion addr = AddressExclusion::parse(address);
// parse class type
ValueRef processClassType = entry.second.get();
ProcessClass processClass(processClassType.toString(), ProcessClass::DBSource);
// make sure we use the underlying Transaction object to avoid used_during_commit()
bool foundChange = false;
for (int i = 0; i < workers.size(); i++) {
if (addr.excludes(workers[i].address)) {
if (processClass.classType() != ProcessClass::InvalidClass)
ryw->getTransaction().set(processClassKeyFor(workers[i].locality.processId().get()),
processClassValue(processClass));
else
ryw->getTransaction().clear(processClassKeyFor(workers[i].locality.processId().get()));
foundChange = true;
}
}
if (foundChange)
ryw->getTransaction().set(processClassChangeKey, deterministicRandom()->randomUniqueID().toString());
}
++iter;
}
return Optional<std::string>();
}
ProcessClassRangeImpl::ProcessClassRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> ProcessClassRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return getProcessClassActor(ryw, getKeyRange().begin, kr);
}
Future<Optional<std::string>> ProcessClassRangeImpl::commit(ReadYourWritesTransaction* ryw) {
// Validate network address and process class type
Optional<std::string> errorMsg;
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(getKeyRange());
auto iter = ranges.begin();
while (iter != ranges.end()) {
auto entry = iter->value();
// only check for setclass(set) operation, (clear) are forbidden thus not exist
if (entry.first && entry.second.present()) {
// validate network address
Key address = iter->begin().removePrefix(range.begin);
AddressExclusion addr = AddressExclusion::parse(address);
if (!addr.isValid()) {
std::string error = "ERROR: \'" + address.toString() + "\' is not a valid network endpoint address\n";
if (address.toString().find(":tls") != std::string::npos)
error += " Do not include the `:tls' suffix when naming a process\n";
errorMsg = ManagementAPIError::toJsonString(false, "setclass", error);
return errorMsg;
}
// validate class type
ValueRef processClassType = entry.second.get();
ProcessClass processClass(processClassType.toString(), ProcessClass::DBSource);
if (processClass.classType() == ProcessClass::InvalidClass &&
processClassType != LiteralStringRef("default")) {
std::string error = "ERROR: \'" + processClassType.toString() + "\' is not a valid process class\n";
errorMsg = ManagementAPIError::toJsonString(false, "setclass", error);
return errorMsg;
}
}
++iter;
}
return processClassCommitActor(ryw, getKeyRange());
}
void throwSpecialKeyApiFailure(ReadYourWritesTransaction* ryw, std::string command, std::string message) {
auto msg = ManagementAPIError::toJsonString(false, command, message);
ryw->setSpecialKeySpaceErrorMsg(msg);
throw special_keys_api_failure();
}
void ProcessClassRangeImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
return throwSpecialKeyApiFailure(ryw, "setclass", "Clear operation is meaningless thus forbidden for setclass");
}
void ProcessClassRangeImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
return throwSpecialKeyApiFailure(
ryw, "setclass", "Clear range operation is meaningless thus forbidden for setclass");
}
ACTOR Future<RangeResult> getProcessClassSourceActor(ReadYourWritesTransaction* ryw, KeyRef prefix, KeyRangeRef kr) {
std::vector<ProcessData> _workers = wait(getWorkers(&ryw->getTransaction()));
auto workers = _workers; // strip const
// Note : the sort by string is anti intuition, ex. 1.1.1.1:11 < 1.1.1.1:5
std::sort(workers.begin(), workers.end(), [](const ProcessData& lhs, const ProcessData& rhs) {
return formatIpPort(lhs.address.ip, lhs.address.port) < formatIpPort(rhs.address.ip, rhs.address.port);
});
RangeResult result;
for (auto& w : workers) {
// exclude :tls in keys even the network addresss is TLS
Key k(prefix.withSuffix(formatIpPort(w.address.ip, w.address.port)));
if (kr.contains(k)) {
Value v(w.processClass.sourceString());
result.push_back(result.arena(), KeyValueRef(k, v));
result.arena().dependsOn(k.arena());
result.arena().dependsOn(v.arena());
}
}
return result;
}
ProcessClassSourceRangeImpl::ProcessClassSourceRangeImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
Future<RangeResult> ProcessClassSourceRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return getProcessClassSourceActor(ryw, getKeyRange().begin, kr);
}
ACTOR Future<RangeResult> getLockedKeyActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(databaseLockedKey));
RangeResult result;
if (val.present()) {
UID uid = UID::fromString(BinaryReader::fromStringRef<UID>(val.get().substr(10), Unversioned()).toString());
result.push_back_deep(result.arena(), KeyValueRef(kr.begin, Value(uid.toString())));
}
return result;
}
LockDatabaseImpl::LockDatabaseImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> LockDatabaseImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
// single key range, the queried range should always be the same as the underlying range
ASSERT(kr == getKeyRange());
auto lockEntry = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("lock")];
if (!ryw->readYourWritesDisabled() && lockEntry.first) {
// ryw enabled and we have written to the special key
RangeResult result;
if (lockEntry.second.present()) {
result.push_back_deep(result.arena(), KeyValueRef(kr.begin, lockEntry.second.get()));
}
return result;
} else {
return getLockedKeyActor(ryw, kr);
}
}
ACTOR Future<Optional<std::string>> lockDatabaseCommitActor(ReadYourWritesTransaction* ryw, UID uid) {
state Optional<std::string> msg;
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(databaseLockedKey));
if (val.present() && BinaryReader::fromStringRef<UID>(val.get().substr(10), Unversioned()) != uid) {
// check database not locked
// if locked already, throw error
throw database_locked();
} else if (!val.present()) {
// lock database
ryw->getTransaction().atomicOp(databaseLockedKey,
BinaryWriter::toValue(uid, Unversioned())
.withPrefix(LiteralStringRef("0123456789"))
.withSuffix(LiteralStringRef("\x00\x00\x00\x00")),
MutationRef::SetVersionstampedValue);
ryw->getTransaction().addWriteConflictRange(normalKeys);
}
return msg;
}
ACTOR Future<Optional<std::string>> unlockDatabaseCommitActor(ReadYourWritesTransaction* ryw) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(databaseLockedKey));
if (val.present()) {
ryw->getTransaction().clear(singleKeyRange(databaseLockedKey));
}
return Optional<std::string>();
}
Future<Optional<std::string>> LockDatabaseImpl::commit(ReadYourWritesTransaction* ryw) {
auto lockId = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("lock")].second;
if (lockId.present()) {
std::string uidStr = lockId.get().toString();
UID uid;
try {
uid = UID::fromString(uidStr);
} catch (Error& e) {
return Optional<std::string>(
ManagementAPIError::toJsonString(false, "lock", "Invalid UID hex string: " + uidStr));
}
return lockDatabaseCommitActor(ryw, uid);
} else {
return unlockDatabaseCommitActor(ryw);
}
}
ACTOR Future<RangeResult> getConsistencyCheckKeyActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
ryw->getTransaction().setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> val = wait(ryw->getTransaction().get(fdbShouldConsistencyCheckBeSuspended));
bool ccSuspendSetting = val.present() ? BinaryReader::fromStringRef<bool>(val.get(), Unversioned()) : false;
RangeResult result;
if (ccSuspendSetting) {
result.push_back_deep(result.arena(), KeyValueRef(kr.begin, ValueRef()));
}
return result;
}
ConsistencyCheckImpl::ConsistencyCheckImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> ConsistencyCheckImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
// single key range, the queried range should always be the same as the underlying range
ASSERT(kr == getKeyRange());
auto entry = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("consistencycheck")];
if (!ryw->readYourWritesDisabled() && entry.first) {
// ryw enabled and we have written to the special key
RangeResult result;
if (entry.second.present()) {
result.push_back_deep(result.arena(), KeyValueRef(kr.begin, entry.second.get()));
}
return result;
} else {
return getConsistencyCheckKeyActor(ryw, kr);
}
}
Future<Optional<std::string>> ConsistencyCheckImpl::commit(ReadYourWritesTransaction* ryw) {
auto entry =
ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("consistencycheck")].second;
ryw->getTransaction().setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
ryw->getTransaction().set(fdbShouldConsistencyCheckBeSuspended,
BinaryWriter::toValue(entry.present(), Unversioned()));
return Optional<std::string>();
}
GlobalConfigImpl::GlobalConfigImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Returns key-value pairs for each value stored in the global configuration
// framework within the range specified. The special-key-space getrange
// function should only be used for informational purposes. All values are
// returned as strings regardless of their true type.
Future<RangeResult> GlobalConfigImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
RangeResult result;
auto& globalConfig = GlobalConfig::globalConfig();
KeyRangeRef modified =
KeyRangeRef(kr.begin.removePrefix(getKeyRange().begin), kr.end.removePrefix(getKeyRange().begin));
std::map<KeyRef, Reference<ConfigValue>> values = globalConfig.get(modified);
for (const auto& [key, config] : values) {
Key prefixedKey = key.withPrefix(getKeyRange().begin);
if (config.isValid() && config->value.has_value()) {
if (config->value.type() == typeid(StringRef)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::any_cast<StringRef>(config->value).toString()));
} else if (config->value.type() == typeid(int64_t)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<int64_t>(config->value))));
} else if (config->value.type() == typeid(bool)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<bool>(config->value))));
} else if (config->value.type() == typeid(float)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<float>(config->value))));
} else if (config->value.type() == typeid(double)) {
result.push_back_deep(result.arena(),
KeyValueRef(prefixedKey, std::to_string(std::any_cast<double>(config->value))));
} else {
ASSERT(false);
}
}
}
return result;
}
// Marks the key for insertion into global configuration.
void GlobalConfigImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(value)));
}
// Writes global configuration changes to durable memory. Also writes the
// changes made in the transaction to a recent history set, and updates the
// latest version which the global configuration was updated at.
ACTOR Future<Optional<std::string>> globalConfigCommitActor(GlobalConfigImpl* globalConfig,
ReadYourWritesTransaction* ryw) {
state Transaction& tr = ryw->getTransaction();
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
// History should only contain three most recent updates. If it currently
// has three items, remove the oldest to make room for a new item.
RangeResult history = wait(tr.getRange(globalConfigHistoryKeys, CLIENT_KNOBS->TOO_MANY));
constexpr int kGlobalConfigMaxHistorySize = 3;
if (history.size() > kGlobalConfigMaxHistorySize - 1) {
for (int i = 0; i < history.size() - (kGlobalConfigMaxHistorySize - 1); ++i) {
tr.clear(history[i].key);
}
}
VersionHistory vh{ 0 };
// Transform writes from the special-key-space (\xff\xff/global_config/) to
// the system key space (\xff/globalConfig/), and writes mutations to
// latest version history.
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::Ranges ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(specialKeys);
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::iterator iter = ranges.begin();
while (iter != ranges.end()) {
std::pair<bool, Optional<Value>> entry = iter->value();
if (entry.first) {
if (entry.second.present() && iter->begin().startsWith(globalConfig->getKeyRange().begin)) {
Key bareKey = iter->begin().removePrefix(globalConfig->getKeyRange().begin);
vh.mutations.emplace_back_deep(vh.mutations.arena(),
MutationRef(MutationRef::SetValue, bareKey, entry.second.get()));
Key systemKey = bareKey.withPrefix(globalConfigKeysPrefix);
tr.set(systemKey, entry.second.get());
} else if (!entry.second.present() && iter->range().begin.startsWith(globalConfig->getKeyRange().begin) &&
iter->range().end.startsWith(globalConfig->getKeyRange().begin)) {
KeyRef bareRangeBegin = iter->range().begin.removePrefix(globalConfig->getKeyRange().begin);
KeyRef bareRangeEnd = iter->range().end.removePrefix(globalConfig->getKeyRange().begin);
vh.mutations.emplace_back_deep(vh.mutations.arena(),
MutationRef(MutationRef::ClearRange, bareRangeBegin, bareRangeEnd));
Key systemRangeBegin = bareRangeBegin.withPrefix(globalConfigKeysPrefix);
Key systemRangeEnd = bareRangeEnd.withPrefix(globalConfigKeysPrefix);
tr.clear(KeyRangeRef(systemRangeBegin, systemRangeEnd));
}
}
++iter;
}
// Record the mutations in this commit into the global configuration history.
Key historyKey = addVersionStampAtEnd(globalConfigHistoryPrefix);
ObjectWriter historyWriter(IncludeVersion());
historyWriter.serialize(vh);
tr.atomicOp(historyKey, historyWriter.toStringRef(), MutationRef::SetVersionstampedKey);
// Write version key to trigger update in cluster controller.
tr.atomicOp(globalConfigVersionKey,
LiteralStringRef("0123456789\x00\x00\x00\x00"), // versionstamp
MutationRef::SetVersionstampedValue);
return Optional<std::string>();
}
// Called when a transaction includes keys in the global configuration special-key-space range.
Future<Optional<std::string>> GlobalConfigImpl::commit(ReadYourWritesTransaction* ryw) {
return globalConfigCommitActor(this, ryw);
}
// Marks the range for deletion from global configuration.
void GlobalConfigImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
ryw->getSpecialKeySpaceWriteMap().insert(range, std::make_pair(true, Optional<Value>()));
}
// Marks the key for deletion from global configuration.
void GlobalConfigImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>()));
}
TracingOptionsImpl::TracingOptionsImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> TracingOptionsImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
RangeResult result;
for (const auto& option : SpecialKeySpace::getTracingOptions()) {
auto key = getKeyRange().begin.withSuffix(option);
if (!kr.contains(key)) {
continue;
}
if (key.endsWith(kTracingTransactionIdKey)) {
result.push_back_deep(result.arena(),
KeyValueRef(key, std::to_string(ryw->getTransactionState()->spanID.first())));
} else if (key.endsWith(kTracingTokenKey)) {
result.push_back_deep(result.arena(),
KeyValueRef(key, std::to_string(ryw->getTransactionState()->spanID.second())));
}
}
return result;
}
void TracingOptionsImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
if (ryw->getApproximateSize() > 0) {
ryw->setSpecialKeySpaceErrorMsg("tracing options must be set first");
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>()));
return;
}
if (key.endsWith(kTracingTransactionIdKey)) {
ryw->setTransactionID(std::stoul(value.toString()));
} else if (key.endsWith(kTracingTokenKey)) {
if (value.toString() == "true") {
ryw->setToken(deterministicRandom()->randomUInt64());
} else if (value.toString() == "false") {
ryw->setToken(0);
} else {
ryw->setSpecialKeySpaceErrorMsg("token must be set to true/false");
throw special_keys_api_failure();
}
}
}
Future<Optional<std::string>> TracingOptionsImpl::commit(ReadYourWritesTransaction* ryw) {
if (ryw->getSpecialKeySpaceWriteMap().size() > 0) {
throw special_keys_api_failure();
}
return Optional<std::string>();
}
void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
ryw->setSpecialKeySpaceErrorMsg("clear range disabled");
throw special_keys_api_failure();
}
void TracingOptionsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
ryw->setSpecialKeySpaceErrorMsg("clear disabled");
throw special_keys_api_failure();
}
CoordinatorsImpl::CoordinatorsImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> CoordinatorsImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
RangeResult result;
KeyRef prefix(getKeyRange().begin);
auto cs = ryw->getDatabase()->getConnectionRecord()->getConnectionString();
auto coordinator_processes = cs.coordinators();
Key cluster_decription_key = prefix.withSuffix(LiteralStringRef("cluster_description"));
if (kr.contains(cluster_decription_key)) {
result.push_back_deep(result.arena(), KeyValueRef(cluster_decription_key, cs.clusterKeyName()));
}
// Note : the sort by string is anti intuition, ex. 1.1.1.1:11 < 1.1.1.1:5
// include :tls in keys if the network addresss is TLS
std::sort(coordinator_processes.begin(),
coordinator_processes.end(),
[](const NetworkAddress& lhs, const NetworkAddress& rhs) { return lhs.toString() < rhs.toString(); });
std::string processes_str;
for (const auto& w : coordinator_processes) {
if (processes_str.size())
processes_str += ",";
processes_str += w.toString();
}
Key processes_key = prefix.withSuffix(LiteralStringRef("processes"));
if (kr.contains(processes_key)) {
result.push_back_deep(result.arena(), KeyValueRef(processes_key, Value(processes_str)));
}
return rywGetRange(ryw, kr, result);
}
ACTOR static Future<Optional<std::string>> coordinatorsCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
state Reference<IQuorumChange> change;
state ClusterConnectionString
conn; // We don't care about the Key here, it will be overrode in changeQuorumChecker().
state std::vector<std::string> process_address_or_hostname_strs;
state Optional<std::string> msg;
state int index;
state bool parse_error = false;
// check update for coordinators
Key processes_key = LiteralStringRef("processes").withPrefix(kr.begin);
auto processes_entry = ryw->getSpecialKeySpaceWriteMap()[processes_key];
if (processes_entry.first) {
ASSERT(processes_entry.second.present()); // no clear should be seen here
auto processesStr = processes_entry.second.get().toString();
boost::split(process_address_or_hostname_strs, processesStr, [](char c) { return c == ','; });
if (!process_address_or_hostname_strs.size()) {
return ManagementAPIError::toJsonString(
false,
"coordinators",
"New coordinators\' processes are empty, please specify new processes\' network addresses with format "
"\"IP:PORT,IP:PORT,...,IP:PORT\" or \"HOSTNAME:PORT,HOSTNAME:PORT,...,HOSTNAME:PORT\"");
}
for (index = 0; index < process_address_or_hostname_strs.size(); index++) {
try {
if (Hostname::isHostname(process_address_or_hostname_strs[index])) {
conn.hostnames.push_back(Hostname::parse(process_address_or_hostname_strs[index]));
conn.status = ClusterConnectionString::ConnectionStringStatus::UNRESOLVED;
} else {
NetworkAddress a = NetworkAddress::parse(process_address_or_hostname_strs[index]);
if (!a.isValid()) {
parse_error = true;
} else {
conn.coords.push_back(a);
}
}
} catch (Error& e) {
TraceEvent(SevDebug, "SpecialKeysNetworkParseError").error(e);
parse_error = true;
}
if (parse_error) {
std::string error = "ERROR: \'" + process_address_or_hostname_strs[index] +
"\' is not a valid network endpoint address\n";
if (process_address_or_hostname_strs[index].find(":tls") != std::string::npos)
error += " Do not include the `:tls' suffix when naming a process\n";
return ManagementAPIError::toJsonString(false, "coordinators", error);
}
}
}
wait(conn.resolveHostnames());
if (conn.coordinators().size())
change = specifiedQuorumChange(conn.coordinators());
else
change = noQuorumChange();
// check update for cluster_description
Key cluster_decription_key = LiteralStringRef("cluster_description").withPrefix(kr.begin);
auto entry = ryw->getSpecialKeySpaceWriteMap()[cluster_decription_key];
if (entry.first) {
// check valid description [a-zA-Z0-9_]+
if (entry.second.present() && isAlphaNumeric(entry.second.get().toString())) {
// do the name change
change = nameQuorumChange(entry.second.get().toString(), change);
} else {
// throw the error
return Optional<std::string>(ManagementAPIError::toJsonString(
false, "coordinators", "Cluster description must match [A-Za-z0-9_]+"));
}
}
ASSERT(change.isValid());
TraceEvent(SevDebug, "SKSChangeCoordinatorsStart")
.detail("NewHostnames", conn.hostnames.size() ? describe(conn.hostnames) : "N/A")
.detail("NewAddresses", describe(conn.coordinators()))
.detail("Description", entry.first ? entry.second.get().toString() : "");
Optional<CoordinatorsResult> r = wait(changeQuorumChecker(&ryw->getTransaction(), change, &conn));
TraceEvent(SevDebug, "SKSChangeCoordinatorsFinish")
.detail("Result", r.present() ? static_cast<int>(r.get()) : -1); // -1 means success
if (r.present()) {
auto res = r.get();
bool retriable = false;
if (res == CoordinatorsResult::COORDINATOR_UNREACHABLE) {
retriable = true;
} else if (res == CoordinatorsResult::SUCCESS) {
TraceEvent(SevError, "SpecialKeysForCoordinators").detail("UnexpectedSuccessfulResult", "");
ASSERT(false);
}
msg = ManagementAPIError::toJsonString(retriable, "coordinators", ManagementAPI::generateErrorMessage(res));
}
return msg;
}
Future<Optional<std::string>> CoordinatorsImpl::commit(ReadYourWritesTransaction* ryw) {
return coordinatorsCommitActor(ryw, getKeyRange());
}
void CoordinatorsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
return throwSpecialKeyApiFailure(ryw, "coordinators", "Clear range is meaningless thus forbidden for coordinators");
}
void CoordinatorsImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
return throwSpecialKeyApiFailure(
ryw, "coordinators", "Clear operation is meaningless thus forbidden for coordinators");
}
CoordinatorsAutoImpl::CoordinatorsAutoImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
ACTOR static Future<RangeResult> CoordinatorsAutoImplActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
state RangeResult res;
state std::string autoCoordinatorsKey;
state Transaction& tr = ryw->getTransaction();
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> currentKey = wait(tr.get(coordinatorsKey));
if (!currentKey.present()) {
ryw->setSpecialKeySpaceErrorMsg(
ManagementAPIError::toJsonString(false, "auto_coordinators", "The coordinator key does not exist"));
throw special_keys_api_failure();
}
state ClusterConnectionString old(currentKey.get().toString());
state CoordinatorsResult result = CoordinatorsResult::SUCCESS;
std::vector<NetworkAddress> _desiredCoordinators = wait(autoQuorumChange()->getDesiredCoordinators(
&tr,
old.coordinators(),
Reference<ClusterConnectionMemoryRecord>(new ClusterConnectionMemoryRecord(old)),
result));
if (result == CoordinatorsResult::NOT_ENOUGH_MACHINES) {
// we could get not_enough_machines if we happen to see the database while the cluster controller is updating
// the worker list, so make sure it happens twice before returning a failure
ryw->setSpecialKeySpaceErrorMsg(ManagementAPIError::toJsonString(
true,
"auto_coordinators",
"Too few fdbserver machines to provide coordination at the current redundancy level"));
throw special_keys_api_failure();
}
for (const auto& address : _desiredCoordinators) {
autoCoordinatorsKey += autoCoordinatorsKey.size() ? "," : "";
autoCoordinatorsKey += address.toString();
}
res.push_back_deep(res.arena(), KeyValueRef(kr.begin, Value(autoCoordinatorsKey)));
return res;
}
Future<RangeResult> CoordinatorsAutoImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
// single key range, the queried range should always be the same as the underlying range
ASSERT(kr == getKeyRange());
return CoordinatorsAutoImplActor(ryw, kr);
}
ACTOR static Future<RangeResult> getMinCommitVersionActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(minRequiredCommitVersionKey));
RangeResult result;
if (val.present()) {
Version minRequiredCommitVersion = BinaryReader::fromStringRef<Version>(val.get(), Unversioned());
ValueRef version(result.arena(), boost::lexical_cast<std::string>(minRequiredCommitVersion));
result.push_back_deep(result.arena(), KeyValueRef(kr.begin, version));
}
return result;
}
AdvanceVersionImpl::AdvanceVersionImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> AdvanceVersionImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
// single key range, the queried range should always be the same as the underlying range
ASSERT(kr == getKeyRange());
auto entry = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("advanceversion")];
if (!ryw->readYourWritesDisabled() && entry.first) {
// ryw enabled and we have written to the special key
RangeResult result;
if (entry.second.present()) {
result.push_back_deep(result.arena(), KeyValueRef(kr.begin, entry.second.get()));
}
return result;
} else {
return getMinCommitVersionActor(ryw, kr);
}
}
ACTOR static Future<Optional<std::string>> advanceVersionCommitActor(ReadYourWritesTransaction* ryw, Version v) {
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
TraceEvent(SevDebug, "AdvanceVersion").detail("MaxAllowedVersion", maxAllowedVerion);
if (v > maxAllowedVerion) {
return ManagementAPIError::toJsonString(
false,
"advanceversion",
"The given version is larger than the maximum allowed value(2**63-1-version_per_second*3600*24*365*1000)");
}
Version rv = wait(ryw->getTransaction().getReadVersion());
if (rv <= v) {
ryw->getTransaction().set(minRequiredCommitVersionKey, BinaryWriter::toValue(v + 1, Unversioned()));
} else {
return ManagementAPIError::toJsonString(
false, "advanceversion", "Current read version is larger than the given version");
}
return Optional<std::string>();
}
Future<Optional<std::string>> AdvanceVersionImpl::commit(ReadYourWritesTransaction* ryw) {
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto minCommitVersion =
ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandPrefix("advanceversion")].second;
if (minCommitVersion.present()) {
try {
// Version is int64_t
Version v = boost::lexical_cast<int64_t>(minCommitVersion.get().toString());
return advanceVersionCommitActor(ryw, v);
} catch (boost::bad_lexical_cast& e) {
return Optional<std::string>(ManagementAPIError::toJsonString(
false, "advanceversion", "Invalid version(int64_t) argument: " + minCommitVersion.get().toString()));
}
} else {
ryw->getTransaction().clear(minRequiredCommitVersionKey);
}
return Optional<std::string>();
}
ClientProfilingImpl::ClientProfilingImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
ACTOR static Future<RangeResult> ClientProfilingGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state RangeResult result;
// client_txn_sample_rate
state Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(prefix);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
if (kr.contains(sampleRateKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[sampleRateKey];
if (!ryw->readYourWritesDisabled() && entry.first) {
// clear is forbidden
ASSERT(entry.second.present());
result.push_back_deep(result.arena(), KeyValueRef(sampleRateKey, entry.second.get()));
} else {
Optional<Value> f = wait(ryw->getTransaction().get(fdbClientInfoTxnSampleRate));
std::string sampleRateStr = "default";
if (f.present()) {
const double sampleRateDbl = BinaryReader::fromStringRef<double>(f.get(), Unversioned());
if (!std::isinf(sampleRateDbl)) {
sampleRateStr = boost::lexical_cast<std::string>(sampleRateDbl);
}
}
result.push_back_deep(result.arena(), KeyValueRef(sampleRateKey, Value(sampleRateStr)));
}
}
// client_txn_size_limit
state Key txnSizeLimitKey = LiteralStringRef("client_txn_size_limit").withPrefix(prefix);
if (kr.contains(txnSizeLimitKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[txnSizeLimitKey];
if (!ryw->readYourWritesDisabled() && entry.first) {
// clear is forbidden
ASSERT(entry.second.present());
result.push_back_deep(result.arena(), KeyValueRef(txnSizeLimitKey, entry.second.get()));
} else {
Optional<Value> f = wait(ryw->getTransaction().get(fdbClientInfoTxnSizeLimit));
std::string sizeLimitStr = "default";
if (f.present()) {
const int64_t sizeLimit = BinaryReader::fromStringRef<int64_t>(f.get(), Unversioned());
if (sizeLimit != -1) {
sizeLimitStr = boost::lexical_cast<std::string>(sizeLimit);
}
}
result.push_back_deep(result.arena(), KeyValueRef(txnSizeLimitKey, Value(sizeLimitStr)));
}
}
return result;
}
// TODO : add limitation on set operation
Future<RangeResult> ClientProfilingImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return ClientProfilingGetRangeActor(ryw, getKeyRange().begin, kr);
}
Future<Optional<std::string>> ClientProfilingImpl::commit(ReadYourWritesTransaction* ryw) {
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
// client_txn_sample_rate
Key sampleRateKey = LiteralStringRef("client_txn_sample_rate").withPrefix(getKeyRange().begin);
auto rateEntry = ryw->getSpecialKeySpaceWriteMap()[sampleRateKey];
if (rateEntry.first && rateEntry.second.present()) {
std::string sampleRateStr = rateEntry.second.get().toString();
double sampleRate;
if (sampleRateStr == "default")
sampleRate = std::numeric_limits<double>::infinity();
else {
try {
sampleRate = boost::lexical_cast<double>(sampleRateStr);
} catch (boost::bad_lexical_cast& e) {
return Optional<std::string>(ManagementAPIError::toJsonString(
false, "profile", "Invalid transaction sample rate(double): " + sampleRateStr));
}
}
ryw->getTransaction().set(fdbClientInfoTxnSampleRate, BinaryWriter::toValue(sampleRate, Unversioned()));
}
// client_txn_size_limit
Key txnSizeLimitKey = LiteralStringRef("client_txn_size_limit").withPrefix(getKeyRange().begin);
auto sizeLimitEntry = ryw->getSpecialKeySpaceWriteMap()[txnSizeLimitKey];
if (sizeLimitEntry.first && sizeLimitEntry.second.present()) {
std::string sizeLimitStr = sizeLimitEntry.second.get().toString();
int64_t sizeLimit;
if (sizeLimitStr == "default")
sizeLimit = -1;
else {
try {
sizeLimit = boost::lexical_cast<int64_t>(sizeLimitStr);
} catch (boost::bad_lexical_cast& e) {
return Optional<std::string>(ManagementAPIError::toJsonString(
false, "profile", "Invalid transaction size limit(int64_t): " + sizeLimitStr));
}
}
ryw->getTransaction().set(fdbClientInfoTxnSizeLimit, BinaryWriter::toValue(sizeLimit, Unversioned()));
}
return Optional<std::string>();
}
void ClientProfilingImpl::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& range) {
return throwSpecialKeyApiFailure(
ryw, "profile", "Clear range is forbidden for profile client. You can set it to default to disable profiling.");
}
void ClientProfilingImpl::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
return throwSpecialKeyApiFailure(
ryw,
"profile",
"Clear operation is forbidden for profile client. You can set it to default to disable profiling.");
}
ActorLineageImpl::ActorLineageImpl(KeyRangeRef kr) : SpecialKeyRangeReadImpl(kr) {}
void parse(StringRef& val, int& i) {
i = std::stoi(val.toString());
}
void parse(StringRef& val, double& d) {
d = std::stod(val.toString());
}
void parse(StringRef& val, WaitState& w) {
if (val == LiteralStringRef("disk") || val == LiteralStringRef("Disk")) {
w = WaitState::Disk;
} else if (val == LiteralStringRef("network") || val == LiteralStringRef("Network")) {
w = WaitState::Network;
} else if (val == LiteralStringRef("running") || val == LiteralStringRef("Running")) {
w = WaitState::Running;
} else {
throw std::range_error("failed to parse run state");
}
}
void parse(StringRef& val, time_t& t) {
struct tm tm;
#ifdef _WIN32
std::istringstream s(val.toString());
s.imbue(std::locale(setlocale(LC_TIME, nullptr)));
s >> std::get_time(&tm, "%FT%T%z");
if (s.fail()) {
throw std::invalid_argument("failed to parse ISO 8601 datetime");
}
long timezone;
if (_get_timezone(&timezone) != 0) {
throw std::runtime_error("failed to convert ISO 8601 datetime");
}
timezone = -timezone;
#else
if (strptime(val.toString().c_str(), "%FT%T%z", &tm) == nullptr) {
throw std::invalid_argument("failed to parse ISO 8601 datetime");
}
long timezone = tm.tm_gmtoff;
t = timegm(&tm);
if (t == -1) {
throw std::runtime_error("failed to convert ISO 8601 datetime");
}
t -= timezone;
#endif
}
void parse(StringRef& val, NetworkAddress& a) {
auto address = NetworkAddress::parse(val.toString());
if (!address.isValid()) {
throw std::invalid_argument("invalid host");
}
a = address;
}
// Base case function for parsing function below.
template <typename T>
void parse(std::vector<StringRef>::iterator it, std::vector<StringRef>::iterator end, T& t1) {
if (it == end) {
return;
}
parse(*it, t1);
}
// Given an iterator into a vector of string tokens, an iterator to the end of
// the search space in the vector (exclusive), and a list of references to
// types, parses each token in the vector into the associated type according to
// the order of the arguments.
//
// For example, given the vector ["1", "1.5", "127.0.0.1:4000"] and the
// argument list int a, double b, NetworkAddress c, after this function returns
// each parameter passed in will hold the parsed value from the token list.
//
// The appropriate parsing function must be implemented for the type you wish
// to parse. See the existing parsing functions above, and add your own if
// necessary.
template <typename T, typename... Types>
void parse(std::vector<StringRef>::iterator it, std::vector<StringRef>::iterator end, T& t1, Types&... remaining) {
// Return as soon as all tokens have been parsed. This allows parameters
// passed at the end to act as optional parameters -- they will only be set
// if the value exists.
if (it == end) {
return;
}
try {
parse(*it, t1);
parse(++it, end, remaining...);
} catch (Error& e) {
throw e;
} catch (std::exception& e) {
throw e;
}
}
ACTOR static Future<RangeResult> actorLineageGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state RangeResult result;
// Set default values for all fields. The default will be used if the field
// is missing in the key.
state NetworkAddress host;
state WaitState waitStateStart = WaitState{ 0 };
state WaitState waitStateEnd = WaitState{ 2 };
state time_t timeStart = 0;
state time_t timeEnd = std::numeric_limits<time_t>::max();
state int seqStart = 0;
state int seqEnd = std::numeric_limits<int>::max();
state std::vector<StringRef> beginValues = kr.begin.removePrefix(prefix).splitAny("/"_sr);
state std::vector<StringRef> endValues = kr.end.removePrefix(prefix).splitAny("/"_sr);
// Require index (either "state" or "time") and address:port.
if (beginValues.size() < 2 || endValues.size() < 2) {
ryw->setSpecialKeySpaceErrorMsg("missing required parameters (index, host)");
throw special_keys_api_failure();
}
state NetworkAddress endRangeHost;
try {
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
// For the range \xff\xff/actor_lineage/state/ip:port/wait-state/time/seq
parse(beginValues.begin() + 1, beginValues.end(), host, waitStateStart, timeStart, seqStart);
if (kr.begin != kr.end) {
parse(endValues.begin() + 1, endValues.end(), endRangeHost, waitStateEnd, timeEnd, seqEnd);
}
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
// For the range \xff\xff/actor_lineage/time/ip:port/time/wait-state/seq
parse(beginValues.begin() + 1, beginValues.end(), host, timeStart, waitStateStart, seqStart);
if (kr.begin != kr.end) {
parse(endValues.begin() + 1, endValues.end(), endRangeHost, timeEnd, waitStateEnd, seqEnd);
}
} else {
ryw->setSpecialKeySpaceErrorMsg("invalid index in actor_lineage");
throw special_keys_api_failure();
}
} catch (Error& e) {
if (e.code() != special_keys_api_failure().code()) {
ryw->setSpecialKeySpaceErrorMsg("failed to parse key");
throw special_keys_api_failure();
} else {
throw e;
}
}
if (kr.begin != kr.end && host != endRangeHost) {
// The client doesn't know about all the hosts, so a get range covering
// multiple hosts has no way of knowing which IP:port combos to use.
ryw->setSpecialKeySpaceErrorMsg("the host must remain the same on both ends of the range");
throw special_keys_api_failure();
}
// Open endpoint to target process on each call. This can be optimized at
// some point...
state ProcessInterface process;
process.getInterface = RequestStream<GetProcessInterfaceRequest>(Endpoint::wellKnown({ host }, WLTOKEN_PROCESS));
ProcessInterface p = wait(retryBrokenPromise(process.getInterface, GetProcessInterfaceRequest{}));
process = p;
ActorLineageRequest actorLineageRequest;
actorLineageRequest.waitStateStart = waitStateStart;
actorLineageRequest.waitStateEnd = waitStateEnd;
actorLineageRequest.timeStart = timeStart;
actorLineageRequest.timeEnd = timeEnd;
ActorLineageReply reply = wait(process.actorLineage.getReply(actorLineageRequest));
time_t dt = 0;
int seq = -1;
for (const auto& sample : reply.samples) {
time_t datetime = (time_t)sample.time;
char buf[50];
struct tm* tm;
tm = localtime(&datetime);
size_t size = strftime(buf, 50, "%FT%T%z", tm);
std::string date(buf, size);
seq = dt == datetime ? seq + 1 : 0;
dt = datetime;
for (const auto& [waitState, data] : sample.data) {
if (seq < seqStart) {
continue;
} else if (seq >= seqEnd) {
break;
}
std::ostringstream streamKey;
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("state").toString() << host.toString()
<< "/" << to_string(waitState) << "/" << date;
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("time").toString() << host.toString()
<< "/" << date << "/" << to_string(waitState);
} else {
ASSERT(false);
}
streamKey << "/" << seq;
msgpack::object_handle oh = msgpack::unpack(data.data(), data.size());
msgpack::object deserialized = oh.get();
std::ostringstream stream;
stream << deserialized;
result.push_back_deep(result.arena(), KeyValueRef(streamKey.str(), stream.str()));
}
if (sample.data.size() == 0) {
std::ostringstream streamKey;
if (SpecialKeySpace::getActorLineageApiCommandRange("state").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("state").toString() << host.toString()
<< "/Running/" << date;
} else if (SpecialKeySpace::getActorLineageApiCommandRange("time").contains(kr)) {
streamKey << SpecialKeySpace::getActorLineageApiCommandPrefix("time").toString() << host.toString()
<< "/" << date << "/Running";
} else {
ASSERT(false);
}
streamKey << "/" << seq;
result.push_back_deep(result.arena(), KeyValueRef(streamKey.str(), "{}"_sr));
}
}
return result;
}
Future<RangeResult> ActorLineageImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return actorLineageGetRangeActor(ryw, getKeyRange().begin, kr);
}
namespace {
std::string_view to_string_view(StringRef sr) {
return std::string_view(reinterpret_cast<const char*>(sr.begin()), sr.size());
}
} // namespace
ActorProfilerConf::ActorProfilerConf(KeyRangeRef kr)
: SpecialKeyRangeRWImpl(kr), config(ProfilerConfig::instance().getConfig()) {}
Future<RangeResult> ActorProfilerConf::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
RangeResult res;
std::string_view begin(to_string_view(kr.begin.removePrefix(range.begin))),
end(to_string_view(kr.end.removePrefix(range.begin)));
for (auto& p : config) {
if (p.first > end) {
break;
} else if (p.first > begin) {
KeyValueRef kv;
kv.key = StringRef(res.arena(), p.first).withPrefix(kr.begin, res.arena());
kv.value = StringRef(res.arena(), p.second);
res.push_back(res.arena(), kv);
}
}
return res;
}
void ActorProfilerConf::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
config[key.removePrefix(range.begin).toString()] = value.toString();
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(value)));
didWrite = true;
}
void ActorProfilerConf::clear(ReadYourWritesTransaction* ryw, const KeyRangeRef& kr) {
std::string begin(kr.begin.removePrefix(range.begin).toString()), end(kr.end.removePrefix(range.begin).toString());
auto first = config.lower_bound(begin);
if (first == config.end()) {
// nothing to clear
return;
}
didWrite = true;
auto last = config.upper_bound(end);
config.erase(first, last);
}
void ActorProfilerConf::clear(ReadYourWritesTransaction* ryw, const KeyRef& key) {
std::string k = key.removePrefix(range.begin).toString();
auto iter = config.find(k);
if (iter != config.end()) {
config.erase(iter);
}
didWrite = true;
}
Future<Optional<std::string>> ActorProfilerConf::commit(ReadYourWritesTransaction* ryw) {
Optional<std::string> res{};
try {
if (didWrite) {
ProfilerConfig::instance().reset(config);
}
return res;
} catch (ConfigError& err) {
return Optional<std::string>{ err.description };
}
}
MaintenanceImpl::MaintenanceImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Used to read the healthZoneKey
// If the key is persisted and the delayed read version is still larger than current read version,
// we will calculate the remaining time(truncated to integer, the same as fdbcli) and return back as the value
// If the zoneId is the special one `ignoreSSFailuresZoneString`,
// value will be 0 (same as fdbcli)
ACTOR static Future<RangeResult> MaintenanceGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state RangeResult result;
// zoneId
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<Value> val = wait(ryw->getTransaction().get(healthyZoneKey));
if (val.present()) {
auto healthyZone = decodeHealthyZoneValue(val.get());
if ((healthyZone.first == ignoreSSFailuresZoneString) ||
(healthyZone.second > ryw->getTransaction().getReadVersion().get())) {
Key zone_key = healthyZone.first.withPrefix(prefix);
double seconds = healthyZone.first == ignoreSSFailuresZoneString
? 0
: (healthyZone.second - ryw->getTransaction().getReadVersion().get()) /
CLIENT_KNOBS->CORE_VERSIONSPERSECOND;
if (kr.contains(zone_key)) {
result.push_back_deep(result.arena(),
KeyValueRef(zone_key, Value(boost::lexical_cast<std::string>(seconds))));
}
}
}
return rywGetRange(ryw, kr, result);
}
Future<RangeResult> MaintenanceImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return MaintenanceGetRangeActor(ryw, getKeyRange().begin, kr);
}
// Commit the change to healthZoneKey
// We do not allow more than one zone to be set in maintenance in one transaction
// In addition, if the zoneId now is 'ignoreSSFailuresZoneString',
// which means the data distribution is disabled for storage failures.
// Only clear this specific key is allowed, any other operations will throw error
ACTOR static Future<Optional<std::string>> maintenanceCommitActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) {
// read
ryw->getTransaction().setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
ryw->getTransaction().setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> val = wait(ryw->getTransaction().get(healthyZoneKey));
Optional<std::pair<Key, Version>> healthyZone =
val.present() ? decodeHealthyZoneValue(val.get()) : Optional<std::pair<Key, Version>>();
state RangeMap<Key, std::pair<bool, Optional<Value>>, KeyRangeRef>::Ranges ranges =
ryw->getSpecialKeySpaceWriteMap().containedRanges(kr);
Key zoneId;
double seconds;
bool isSet = false;
// Since maintenance only allows one zone at the same time,
// if a transaction has more than one set operation on different zone keys,
// the commit will throw an error
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (!iter->value().first)
continue;
if (iter->value().second.present()) {
if (isSet)
return Optional<std::string>(ManagementAPIError::toJsonString(
false, "maintenance", "Multiple zones given for maintenance, only one allowed at the same time"));
isSet = true;
zoneId = iter->begin().removePrefix(kr.begin);
seconds = boost::lexical_cast<double>(iter->value().second.get().toString());
} else {
// if we already have set operation, then all clear operations will be meaningless, thus skip
if (!isSet && healthyZone.present() && iter.range().contains(healthyZone.get().first.withPrefix(kr.begin)))
ryw->getTransaction().clear(healthyZoneKey);
}
}
if (isSet) {
if (healthyZone.present() && healthyZone.get().first == ignoreSSFailuresZoneString) {
std::string msg = "Maintenance mode cannot be used while data distribution is disabled for storage "
"server failures.";
return Optional<std::string>(ManagementAPIError::toJsonString(false, "maintenance", msg));
} else if (seconds < 0) {
std::string msg =
"The specified maintenance time " + boost::lexical_cast<std::string>(seconds) + " is a negative value";
return Optional<std::string>(ManagementAPIError::toJsonString(false, "maintenance", msg));
} else {
TraceEvent(SevDebug, "SKSMaintenanceSet").detail("ZoneId", zoneId.toString());
ryw->getTransaction().set(healthyZoneKey,
healthyZoneValue(zoneId,
ryw->getTransaction().getReadVersion().get() +
(seconds * CLIENT_KNOBS->CORE_VERSIONSPERSECOND)));
}
}
return Optional<std::string>();
}
Future<Optional<std::string>> MaintenanceImpl::commit(ReadYourWritesTransaction* ryw) {
return maintenanceCommitActor(ryw, getKeyRange());
}
DataDistributionImpl::DataDistributionImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
// Read the system keys dataDistributionModeKey and rebalanceDDIgnoreKey
ACTOR static Future<RangeResult> DataDistributionGetRangeActor(ReadYourWritesTransaction* ryw,
KeyRef prefix,
KeyRangeRef kr) {
state RangeResult result;
// dataDistributionModeKey
state Key modeKey = LiteralStringRef("mode").withPrefix(prefix);
ryw->getTransaction().setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
if (kr.contains(modeKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[modeKey];
if (ryw->readYourWritesDisabled() || !entry.first) {
Optional<Value> f = wait(ryw->getTransaction().get(dataDistributionModeKey));
int mode = -1;
if (f.present()) {
mode = BinaryReader::fromStringRef<int>(f.get(), Unversioned());
}
result.push_back_deep(result.arena(), KeyValueRef(modeKey, Value(boost::lexical_cast<std::string>(mode))));
}
}
// rebalanceDDIgnoreKey
state Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(prefix);
if (kr.contains(rebalanceIgnoredKey)) {
auto entry = ryw->getSpecialKeySpaceWriteMap()[rebalanceIgnoredKey];
if (ryw->readYourWritesDisabled() || !entry.first) {
Optional<Value> f = wait(ryw->getTransaction().get(rebalanceDDIgnoreKey));
if (f.present()) {
result.push_back_deep(result.arena(), KeyValueRef(rebalanceIgnoredKey, Value()));
}
}
}
return rywGetRange(ryw, kr, result);
}
Future<RangeResult> DataDistributionImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
return DataDistributionGetRangeActor(ryw, getKeyRange().begin, kr);
}
Future<Optional<std::string>> DataDistributionImpl::commit(ReadYourWritesTransaction* ryw) {
// there are two valid keys in the range
// <prefix>/mode -> dataDistributionModeKey, the value is only allowed to be set as "0"(disable) or "1"(enable)
// <prefix>/rebalance_ignored -> rebalanceDDIgnoreKey, value is unused thus empty
ryw->getTransaction().setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
Optional<std::string> msg;
KeyRangeRef kr = getKeyRange();
Key modeKey = LiteralStringRef("mode").withPrefix(kr.begin);
Key rebalanceIgnoredKey = LiteralStringRef("rebalance_ignored").withPrefix(kr.begin);
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(kr);
for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) {
if (!iter->value().first)
continue;
if (iter->value().second.present()) {
if (iter->range() == singleKeyRange(modeKey)) {
try {
int mode = boost::lexical_cast<int>(iter->value().second.get().toString());
Value modeVal = BinaryWriter::toValue(mode, Unversioned());
if (mode == 0 || mode == 1) {
// Whenever configuration changes or DD related system keyspace is changed,
// actor must grab the moveKeysLockOwnerKey and update moveKeysLockWriteKey.
// This prevents concurrent write to the same system keyspace.
// When the owner of the DD related system keyspace changes, DD will reboot
BinaryWriter wrMyOwner(Unversioned());
wrMyOwner << dataDistributionModeLock;
ryw->getTransaction().set(moveKeysLockOwnerKey, wrMyOwner.toValue());
BinaryWriter wrLastWrite(Unversioned());
wrLastWrite << deterministicRandom()->randomUniqueID();
ryw->getTransaction().set(moveKeysLockWriteKey, wrLastWrite.toValue());
// set mode
ryw->getTransaction().set(dataDistributionModeKey, modeVal);
} else
msg = ManagementAPIError::toJsonString(false,
"datadistribution",
"Please set the value of the data_distribution/mode to "
"0(disable) or 1(enable), other values are not allowed");
} catch (boost::bad_lexical_cast& e) {
msg = ManagementAPIError::toJsonString(false,
"datadistribution",
"Invalid datadistribution mode(int): " +
iter->value().second.get().toString());
}
} else if (iter->range() == singleKeyRange(rebalanceIgnoredKey)) {
if (iter->value().second.get().size())
msg =
ManagementAPIError::toJsonString(false,
"datadistribution",
"Value is unused for the data_distribution/rebalance_ignored "
"key, please set it to an empty value");
else
ryw->getTransaction().set(rebalanceDDIgnoreKey, LiteralStringRef("on"));
} else {
msg = ManagementAPIError::toJsonString(
false,
"datadistribution",
"Changing invalid keys, please read the documentation to check valid keys in the range");
}
} else {
// clear
if (iter->range().contains(modeKey))
ryw->getTransaction().clear(dataDistributionModeKey);
else if (iter->range().contains(rebalanceIgnoredKey))
ryw->getTransaction().clear(rebalanceDDIgnoreKey);
}
}
return msg;
}
// Clears the special management api keys excludeLocality and failedLocality.
void includeLocalities(ReadYourWritesTransaction* ryw) {
ryw->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
ryw->setOption(FDBTransactionOptions::LOCK_AWARE);
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
ryw->setOption(FDBTransactionOptions::USE_PROVISIONAL_PROXIES);
// includeLocalities might be used in an emergency transaction, so make sure it is retry-self-conflicting and
// CAUSAL_WRITE_RISKY
ryw->setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY);
std::string versionKey = deterministicRandom()->randomUniqueID().toString();
// for excluded localities
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(
SpecialKeySpace::getManagementApiCommandRange("excludedlocality"));
Transaction& tr = ryw->getTransaction();
for (auto& iter : ranges) {
auto entry = iter.value();
if (entry.first && !entry.second.present()) {
tr.addReadConflictRange(singleKeyRange(excludedLocalityVersionKey));
tr.set(excludedLocalityVersionKey, versionKey);
tr.clear(ryw->getDatabase()->specialKeySpace->decode(iter.range()));
}
}
// for failed localities
ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(
SpecialKeySpace::getManagementApiCommandRange("failedlocality"));
for (auto& iter : ranges) {
auto entry = iter.value();
if (entry.first && !entry.second.present()) {
tr.addReadConflictRange(singleKeyRange(failedLocalityVersionKey));
tr.set(failedLocalityVersionKey, versionKey);
tr.clear(ryw->getDatabase()->specialKeySpace->decode(iter.range()));
}
}
}
// Reads the excludedlocality and failed locality keys using managment api,
// parses them and returns the list.
bool parseLocalitiesFromKeys(ReadYourWritesTransaction* ryw,
bool failed,
std::unordered_set<std::string>& localities,
std::vector<AddressExclusion>& addresses,
std::set<AddressExclusion>& exclusions,
std::vector<ProcessData>& workers,
Optional<std::string>& msg) {
KeyRangeRef range = failed ? SpecialKeySpace::getManagementApiCommandRange("failedlocality")
: SpecialKeySpace::getManagementApiCommandRange("excludedlocality");
auto ranges = ryw->getSpecialKeySpaceWriteMap().containedRanges(range);
auto iter = ranges.begin();
while (iter != ranges.end()) {
auto entry = iter->value();
// only check for exclude(set) operation, include(clear) are not checked
TraceEvent(SevDebug, "ParseLocalities")
.detail("Valid", entry.first)
.detail("Set", entry.second.present())
.detail("Key", iter->begin().toString());
if (entry.first && entry.second.present()) {
Key locality = iter->begin().removePrefix(range.begin);
if (locality.startsWith(LocalityData::ExcludeLocalityPrefix) &&
locality.toString().find(':') != std::string::npos) {
std::set<AddressExclusion> localityAddresses = getAddressesByLocality(workers, locality.toString());
if (!localityAddresses.empty()) {
std::copy(localityAddresses.begin(), localityAddresses.end(), back_inserter(addresses));
exclusions.insert(localityAddresses.begin(), localityAddresses.end());
}
localities.insert(locality.toString());
} else {
std::string error = "ERROR: \'" + locality.toString() + "\' is not a valid locality\n";
msg = ManagementAPIError::toJsonString(
false, entry.second.present() ? (failed ? "exclude failed" : "exclude") : "include", error);
return false;
}
}
++iter;
}
return true;
}
// On commit, parses the special exclusion keys and get the localities to be excluded, check for exclusions
// and add them to the exclusion list. Also, clears the special management api keys with includeLocalities.
ACTOR Future<Optional<std::string>> excludeLocalityCommitActor(ReadYourWritesTransaction* ryw, bool failed) {
state Optional<std::string> result;
state std::unordered_set<std::string> localities;
state std::vector<AddressExclusion> addresses;
state std::set<AddressExclusion> exclusions;
ryw->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state std::vector<ProcessData> workers = wait(getWorkers(&ryw->getTransaction()));
if (!parseLocalitiesFromKeys(ryw, failed, localities, addresses, exclusions, workers, result))
return result;
// If force option is not set, we need to do safety check
auto force = ryw->getSpecialKeySpaceWriteMap()[SpecialKeySpace::getManagementApiCommandOptionSpecialKey(
failed ? "failed_locality" : "excluded_locality", "force")];
// only do safety check when we have localities to be excluded and the force option key is not set
if (localities.size() && !(force.first && force.second.present())) {
bool safe = wait(checkExclusion(ryw->getDatabase(), &addresses, &exclusions, failed, &result));
if (!safe)
return result;
}
excludeLocalities(ryw->getTransaction(), localities, failed);
includeLocalities(ryw);
return result;
}
ExcludedLocalitiesRangeImpl::ExcludedLocalitiesRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> ExcludedLocalitiesRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}
void ExcludedLocalitiesRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
// ignore value
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(ValueRef())));
}
Key ExcludedLocalitiesRangeImpl::decode(const KeyRef& key) const {
return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.withPrefix(LiteralStringRef("\xff/conf/"));
}
Key ExcludedLocalitiesRangeImpl::encode(const KeyRef& key) const {
return key.removePrefix(LiteralStringRef("\xff/conf/"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
}
Future<Optional<std::string>> ExcludedLocalitiesRangeImpl::commit(ReadYourWritesTransaction* ryw) {
// exclude locality with failed option as false.
return excludeLocalityCommitActor(ryw, false);
}
FailedLocalitiesRangeImpl::FailedLocalitiesRangeImpl(KeyRangeRef kr) : SpecialKeyRangeRWImpl(kr) {}
Future<RangeResult> FailedLocalitiesRangeImpl::getRange(ReadYourWritesTransaction* ryw,
KeyRangeRef kr,
GetRangeLimits limitsHint) const {
ryw->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return rwModuleWithMappingGetRangeActor(ryw, this, kr);
}
void FailedLocalitiesRangeImpl::set(ReadYourWritesTransaction* ryw, const KeyRef& key, const ValueRef& value) {
// ignore value
ryw->getSpecialKeySpaceWriteMap().insert(key, std::make_pair(true, Optional<Value>(ValueRef())));
}
Key FailedLocalitiesRangeImpl::decode(const KeyRef& key) const {
return key.removePrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)
.withPrefix(LiteralStringRef("\xff/conf/"));
}
Key FailedLocalitiesRangeImpl::encode(const KeyRef& key) const {
return key.removePrefix(LiteralStringRef("\xff/conf/"))
.withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin);
}
Future<Optional<std::string>> FailedLocalitiesRangeImpl::commit(ReadYourWritesTransaction* ryw) {
// exclude locality with failed option as true.
return excludeLocalityCommitActor(ryw, true);
}