Merge branch 'main' of github.com:apple/foundationdb into commitproxies

This commit is contained in:
Ankita Kejriwal 2022-11-03 18:09:29 -07:00
commit 27246dd4e6
171 changed files with 4730 additions and 1884 deletions

View File

@ -1,5 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401, E711
ignore = E203, E266, E501, W503, F403, F401, E711, C901
max-line-length = 79
max-complexity = 18
select = B,C,E,F,W,T4,B9

View File

@ -577,7 +577,7 @@ class ApiTest(Test):
key1, key2 = key2, key1
# TODO: randomize chunkSize but should not exceed 100M(shard limit)
chunkSize = 10000000 # 10M
chunkSize = 10000000 # 10M
instructions.push_args(key1, key2, chunkSize)
instructions.append(op)
self.add_strings(1)

View File

@ -114,7 +114,7 @@ class DirectoryTest(Test):
instructions.push_args(layer)
instructions.push_args(*test_util.with_length(path))
instructions.append('DIRECTORY_OPEN')
self.dir_list.append(self.root.add_child(path, DirectoryStateTreeNode(True, True, has_known_prefix=False, is_partition=(layer==b'partition'))))
self.dir_list.append(self.root.add_child(path, DirectoryStateTreeNode(True, True, has_known_prefix=False, is_partition=(layer == b'partition'))))
# print('%d. Selected %s, dir=%s, dir_id=%s, has_known_prefix=%s, dir_list_len=%d' \
# % (len(instructions), 'DIRECTORY_OPEN', repr(self.dir_index), self.dir_list[-1].dir_id, False, len(self.dir_list)-1))
@ -163,8 +163,8 @@ class DirectoryTest(Test):
elif root_op == 'DIRECTORY_CREATE_LAYER':
indices = []
prefixes = [generate_prefix(require_unique=args.concurrency==1, is_partition=True) for i in range(2)]
prefixes = [generate_prefix(require_unique=args.concurrency == 1, is_partition=True) for i in range(2)]
for i in range(2):
instructions.push_args(prefixes[i])
instructions.push_args(*test_util.with_length(generate_path()))
@ -184,9 +184,9 @@ class DirectoryTest(Test):
test_util.blocking_commit(instructions)
path = generate_path()
# Partitions that use the high-contention allocator can result in non-determinism if they fail to commit,
# Partitions that use the high-contention allocator can result in non-determinism if they fail to commit,
# so we disallow them in comparison tests
op_args = test_util.with_length(path) + (self.generate_layer(allow_partition=args.concurrency>1),)
op_args = test_util.with_length(path) + (self.generate_layer(allow_partition=args.concurrency > 1),)
directory_util.push_instruction_and_record_prefix(instructions, op, op_args, path, len(self.dir_list), self.random, self.prefix_log)
if not op.endswith('_DATABASE') and args.concurrency == 1:
@ -196,14 +196,14 @@ class DirectoryTest(Test):
if child_entry is None:
child_entry = DirectoryStateTreeNode(True, True)
child_entry.state.has_known_prefix = False
child_entry.state.has_known_prefix = False
self.dir_list.append(dir_entry.add_child(path, child_entry))
elif root_op == 'DIRECTORY_CREATE':
layer = self.generate_layer()
is_partition = layer == b'partition'
prefix = generate_prefix(require_unique=is_partition and args.concurrency==1, is_partition=is_partition, min_length=0)
prefix = generate_prefix(require_unique=is_partition and args.concurrency == 1, is_partition=is_partition, min_length=0)
# Because allocated prefixes are non-deterministic, we cannot have overlapping
# transactions that allocate/remove these prefixes in a comparison test
@ -409,7 +409,7 @@ def generate_prefix(require_unique=False, is_partition=False, min_length=1):
if require_unique:
min_length = max(min_length, 16)
length = random.randint(min_length, min_length+5)
length = random.randint(min_length, min_length + 5)
if length == 0:
return b''
@ -419,6 +419,6 @@ def generate_prefix(require_unique=False, is_partition=False, min_length=1):
else:
return bytes([random.randrange(ord('\x02'), ord('\x14')) for i in range(0, length)])
else:
prefix = fixed_prefix
prefix = fixed_prefix
generated = prefix[0:random.randrange(min_length, len(prefix))]
return generated

View File

@ -1,5 +1,6 @@
import sys
class TreeNodeState:
def __init__(self, node, dir_id, is_directory, is_subspace, has_known_prefix, root, is_partition):
self.dir_id = dir_id
@ -9,10 +10,11 @@ class TreeNodeState:
self.root = root
self.is_partition = is_partition
self.parents = { node }
self.parents = {node}
self.children = {}
self.deleted = False
# Represents an element of the directory hierarchy. As a result of various operations (e.g. moves) that
# may or may not have succeeded, a node can represent multiple possible states.
class DirectoryStateTreeNode:
@ -25,7 +27,7 @@ class DirectoryStateTreeNode:
default_directory = None
# Used for debugging
dir_id = 0
dir_id = 0
@classmethod
def reset(cls):
@ -62,7 +64,7 @@ class DirectoryStateTreeNode:
if default is not None:
default_child = default.state.children.get(subpath[0])
self_child = self.state.children.get(subpath[0])
self_child = self.state.children.get(subpath[0])
if self_child is None:
if default_child is None:
@ -143,13 +145,15 @@ class DirectoryStateTreeNode:
child = self.get_descendent(path)
if child:
child._delete_impl()
def validate_dir(dir, root):
if dir.state.is_directory:
assert dir.state.root == root
else:
assert dir.state.root == dir
def run_test():
all_entries = []
@ -249,11 +253,11 @@ def run_test():
# Test moving an entry
assert not entry.state.has_known_prefix
assert not entry.state.is_subspace
assert list(entry.state.children.keys()) == ['1']
assert list(entry.state.children.keys()) == ['1']
for e in all_entries:
validate_dir(e, root)
if __name__ == '__main__':
sys.exit(run_test())

View File

@ -18,7 +18,6 @@
# limitations under the License.
#
import random
import struct
import fdb
@ -35,6 +34,7 @@ DEFAULT_DIRECTORY_INDEX = 4
DEFAULT_DIRECTORY_PREFIX = b'default'
DIRECTORY_ERROR_STRING = b'DIRECTORY_ERROR'
def setup_directories(instructions, default_path, random):
# Clients start with the default directory layer in the directory list
DirectoryStateTreeNode.reset()

View File

@ -107,7 +107,7 @@ class RandomGenerator(object):
user_version = random.randint(0, 0xffff)
tup.append(fdb.tuple.Versionstamp(tr_version, user_version))
else:
assert false
assert False
return tuple(tup)

View File

@ -31,6 +31,7 @@ from bindingtester.tests import test_util
fdb.api_version(FDB_API_VERSION)
class TupleTest(Test):
def __init__(self, subspace):
super(TupleTest, self).__init__(subspace)
@ -44,14 +45,14 @@ class TupleTest(Test):
def generate(self, args, thread_number):
instructions = InstructionSet()
min_value = -2**self.max_int_bits+1
max_value = 2**self.max_int_bits-1
min_value = -2**self.max_int_bits + 1
max_value = 2**self.max_int_bits - 1
instructions.append('NEW_TRANSACTION')
# Test integer encoding
mutations = 0
for i in range(0, self.max_int_bits+1):
for i in range(0, self.max_int_bits + 1):
for sign in [-1, 1]:
sign_str = '' if sign == 1 else '-'
for offset in range(-10, 11):

View File

@ -442,7 +442,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer
DEPENDS ${IMPLIBSO_SRC} fdb_c
COMMENT "Generating source code for C shim library")
add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack")
target_link_libraries(fdb_c_shim PUBLIC dl)
target_include_directories(fdb_c_shim PUBLIC

View File

@ -21,7 +21,7 @@
#include "fdbclient/FDBTypes.h"
#include "flow/ProtocolVersion.h"
#include <cstdint>
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#define FDB_INCLUDE_LEGACY_TYPES
#include "fdbclient/MultiVersionTransaction.h"
@ -905,6 +905,10 @@ extern "C" DLLEXPORT fdb_error_t fdb_transaction_get_committed_version(FDBTransa
CATCH_AND_RETURN(*out_version = TXN(tr)->getCommittedVersion(););
}
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_total_cost(FDBTransaction* tr) {
return (FDBFuture*)TXN(tr)->getTotalCost().extractPtr();
}
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_approximate_size(FDBTransaction* tr) {
return (FDBFuture*)TXN(tr)->getApproximateSize().extractPtr();
}

View File

@ -27,10 +27,10 @@
#endif
#if !defined(FDB_API_VERSION)
#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 720)
#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 730)
#elif FDB_API_VERSION < 13
#error API version no longer supported (upgrade to 13)
#elif FDB_API_VERSION > 720
#elif FDB_API_VERSION > 730
#error Requested API version requires a newer version of this header
#endif
@ -514,12 +514,14 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_transaction_get_committed_version(F
int64_t* out_version);
/*
* This function intentionally returns an FDBFuture instead of an integer
* directly, so that calling this API can see the effect of previous
* These functions intentionally return an FDBFuture instead of an integer
 * directly, so that callers of these APIs can see the effect of previous
* mutations on the transaction. Specifically, mutations are applied
* asynchronously by the main thread. In order to see them, this call has to
* be serviced by the main thread too.
*/
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_total_cost(FDBTransaction* tr);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_approximate_size(FDBTransaction* tr);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_versionstamp(FDBTransaction* tr);

View File

@ -20,11 +20,14 @@
#include "TesterApiWorkload.h"
#include "TesterBlobGranuleUtil.h"
#include "TesterUtil.h"
#include <unordered_set>
#include <memory>
#include <fmt/format.h>
namespace FdbApiTester {
#define BG_API_DEBUG_VERBOSE false
class ApiBlobGranuleCorrectnessWorkload : public ApiWorkload {
public:
ApiBlobGranuleCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {
@ -35,7 +38,7 @@ public:
}
private:
// FIXME: use other new blob granule apis!
// FIXME: add tenant support for DB operations
enum OpType {
OP_INSERT,
OP_CLEAR,
@ -51,7 +54,27 @@ private:
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
bool seenReadSuccess = false;
// FIXME: this needs to be per tenant if tenant ids are set
std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
std::string tenantDebugString(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format(" (tenant {0})", tenantId.value()) : "";
}
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}){3}: {4}",
opName,
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
tenantDebugString(tenantId),
message));
}
}
void randomReadOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
@ -63,8 +86,10 @@ private:
auto results = std::make_shared<std::vector<fdb::KeyValue>>();
auto tooOld = std::make_shared<bool>(false);
debugOp("Read", begin, end, tenantId, "starting");
execTransaction(
[this, begin, end, results, tooOld](auto ctx) {
[this, begin, end, tenantId, results, tooOld](auto ctx) {
ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
TesterGranuleContext testerContext(ctx->getBGBasePath());
fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext);
@ -74,8 +99,13 @@ private:
auto out = fdb::Result::KeyValueRefArray{};
fdb::Error err = res.getKeyValueArrayNothrow(out);
if (err.code() == error_code_blob_granule_transaction_too_old) {
info("BlobGranuleCorrectness::randomReadOp bg too old\n");
ASSERT(!seenReadSuccess);
bool previousSuccess = seenReadSuccess(tenantId);
if (previousSuccess) {
error("Read bg too old after read success!\n");
} else {
info("Read bg too old\n");
}
ASSERT(!previousSuccess);
*tooOld = true;
ctx->done();
} else if (err.code() != error_code_success) {
@ -85,10 +115,13 @@ private:
auto& [resVector, out_more] = resCopy;
ASSERT(!out_more);
results.get()->assign(resVector.begin(), resVector.end());
if (!seenReadSuccess) {
info("BlobGranuleCorrectness::randomReadOp first success\n");
bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) {
info(fmt::format("Read{0}: first success\n", tenantDebugString(tenantId)));
setReadSuccess(tenantId);
} else {
debugOp("Read", begin, end, tenantId, "complete");
}
seenReadSuccess = true;
ctx->done();
}
},
@ -97,7 +130,7 @@ private:
std::vector<fdb::KeyValue> expected =
stores[tenantId].getRange(begin, end, stores[tenantId].size(), false);
if (results->size() != expected.size()) {
error(fmt::format("randomReadOp result size mismatch. expected: {} actual: {}",
error(fmt::format("randomReadOp result size mismatch. expected: {0} actual: {1}",
expected.size(),
results->size()));
}
@ -105,7 +138,7 @@ private:
for (int i = 0; i < results->size(); i++) {
if ((*results)[i].key != expected[i].key) {
error(fmt::format("randomReadOp key mismatch at {}/{}. expected: {} actual: {}",
error(fmt::format("randomReadOp key mismatch at {0}/{1}. expected: {2} actual: {3}",
i,
results->size(),
fdb::toCharsRef(expected[i].key),
@ -138,6 +171,8 @@ private:
}
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
debugOp("GetGranules", begin, end, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType();
@ -149,15 +184,17 @@ private:
},
true);
},
[this, begin, end, results, cont]() {
this->validateRanges(results, begin, end, seenReadSuccess);
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetGranules", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont);
},
getTenant(tenantId));
}
void randomSummarizeOp(TTaskFct cont, std::optional<int> tenantId) {
if (!seenReadSuccess) {
if (!seenReadSuccess(tenantId)) {
// tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a
// read success
schedule(cont);
@ -169,6 +206,9 @@ private:
std::swap(begin, end);
}
auto results = std::make_shared<std::vector<fdb::GranuleSummary>>();
debugOp("Summarize", begin, end, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2 /*latest version*/, 1000).eraseType();
@ -180,10 +220,11 @@ private:
},
true);
},
[this, begin, end, results, cont]() {
ASSERT(results->size() > 0);
ASSERT(results->front().keyRange.beginKey <= begin);
ASSERT(results->back().keyRange.endKey >= end);
[this, begin, end, tenantId, results, cont]() {
debugOp("Summarize", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
// use validateRanges to share validation
auto ranges = std::make_shared<std::vector<fdb::KeyRange>>();
for (int i = 0; i < results->size(); i++) {
// TODO: could do validation of subsequent calls and ensure snapshot version never decreases
@ -191,12 +232,11 @@ private:
ASSERT((*results)[i].snapshotVersion <= (*results)[i].deltaVersion);
ASSERT((*results)[i].snapshotSize > 0);
ASSERT((*results)[i].deltaSize >= 0);
ranges->push_back((*results)[i].keyRange);
}
for (int i = 1; i < results->size(); i++) {
// ranges contain entire requested key range
ASSERT((*results)[i].keyRange.beginKey == (*results)[i - 1].keyRange.endKey);
}
this->validateRanges(ranges, begin, end, true);
schedule(cont);
},
@ -208,18 +248,29 @@ private:
fdb::Key end,
bool shouldBeRanges) {
if (shouldBeRanges) {
if (results->size() == 0) {
error(fmt::format(
"ValidateRanges: [{0} - {1}): No ranges returned!", fdb::toCharsRef(begin), fdb::toCharsRef(end)));
}
ASSERT(results->size() > 0);
if (results->front().beginKey > begin || results->back().endKey < end) {
error(fmt::format("ValidateRanges: [{0} - {1}): Incomplete range(s) returned [{2} - {3})!",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(results->front().beginKey),
fdb::toCharsRef(results->back().endKey)));
}
ASSERT(results->front().beginKey <= begin);
ASSERT(results->back().endKey >= end);
}
for (int i = 0; i < results->size(); i++) {
// no empty or inverted ranges
if ((*results)[i].beginKey >= (*results)[i].endKey) {
error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey),
error(fmt::format("ValidateRanges: [{0} - {1}): Empty/inverted range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end)));
fdb::toCharsRef(end),
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey)));
}
ASSERT((*results)[i].beginKey < (*results)[i].endKey);
}
@ -227,16 +278,17 @@ private:
for (int i = 1; i < results->size(); i++) {
// ranges contain entire requested key range
if ((*results)[i].beginKey != (*results)[i].endKey) {
error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey),
error(fmt::format("ValidateRanges: [{0} - {1}): Non-covereed range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end)));
fdb::toCharsRef(end),
fdb::toCharsRef((*results)[i - 1].endKey),
fdb::toCharsRef((*results)[i].endKey)));
}
ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey);
}
}
// TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
@ -244,6 +296,10 @@ private:
if (begin > end) {
std::swap(begin, end);
}
std::optional<int> tenantId = {};
debugOp("GetBlobRanges", begin, end, tenantId, "starting");
execOperation(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType();
@ -252,22 +308,27 @@ private:
ctx->done();
});
},
[this, begin, end, results, cont]() {
this->validateRanges(results, begin, end, seenReadSuccess);
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetBlobRanges", begin, end, tenantId, fmt::format("complete with {0} ranges", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont);
},
/* failOnError = */ false);
}
// TODO: tenant support
void randomVerifyOp(TTaskFct cont) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
std::optional<int> tenantId;
if (begin > end) {
std::swap(begin, end);
}
auto verifyVersion = std::make_shared<int64_t>(false);
// info("Verify op starting");
debugOp("Verify", begin, end, tenantId, "starting");
execOperation(
[begin, end, verifyVersion](auto ctx) {
@ -277,16 +338,15 @@ private:
ctx->done();
});
},
[this, begin, end, verifyVersion, cont]() {
[this, begin, end, tenantId, verifyVersion, cont]() {
debugOp("Verify", begin, end, tenantId, fmt::format("Complete @ {0}", *verifyVersion));
bool previousSuccess = seenReadSuccess(tenantId);
if (*verifyVersion == -1) {
ASSERT(!seenReadSuccess);
} else {
if (!seenReadSuccess) {
info("BlobGranuleCorrectness::randomVerifyOp first success");
}
seenReadSuccess = true;
ASSERT(!previousSuccess);
} else if (!previousSuccess) {
info(fmt::format("Verify{0}: first success\n", tenantDebugString(tenantId)));
setReadSuccess(tenantId);
}
// info(fmt::format("verify op done @ {}", *verifyVersion));
schedule(cont);
},
/* failOnError = */ false);

View File

@ -27,7 +27,7 @@
#include <unordered_map>
#include <vector>
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
namespace FdbApiTester {

View File

@ -36,7 +36,7 @@ namespace FdbApiTester {
namespace {
#define API_VERSION_CLIENT_TMP_DIR 720
#define API_VERSION_CLIENT_TMP_DIR 730
enum TesterOptionId {
OPT_CONNFILE,
@ -459,8 +459,10 @@ int main(int argc, char** argv) {
retCode = 1;
}
fprintf(stderr, "Stopping FDB network thread\n");
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
fprintf(stderr, "FDB network thread successfully stopped\n");
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());
retCode = 1;

View File

@ -18,7 +18,7 @@
* limitations under the License.
*/
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include "unit/fdb_api.hpp"

View File

@ -23,7 +23,7 @@
#pragma once
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#endif
#include <cassert>
@ -716,6 +716,12 @@ public:
throwError("Failed to create transaction: ", err);
return Transaction(tx_native);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
if (!tenant)
throw std::runtime_error("blobbifyRange from null tenant");
return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
};
class Database {

View File

@ -283,24 +283,60 @@ int populate(Database db,
int batch_size = args.tenant_batch_size;
int batches = (args.total_tenants + batch_size - 1) / batch_size;
for (int batch = 0; batch < batches; ++batch) {
while (1) {
for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
std::string tenant_str = "tenant" + std::to_string(i);
Tenant::createTenant(systemTx, toBytesRef(tenant_str));
}
auto future_commit = systemTx.commit();
const auto rc = waitAndHandleError(systemTx, future_commit, "CREATE_TENANT");
if (rc == FutureRC::OK) {
// Keep going with reset transaction if commit was successful
systemTx.reset();
break;
} else if (rc == FutureRC::RETRY) {
// We want to retry this batch. Transaction is already reset
} else {
// Abort
return -1;
}
}
Tenant tenants[batch_size];
fdb::TypedFuture<fdb::future_var::Bool> blobbifyResults[batch_size];
// blobbify tenant ranges explicitly
// FIXME: skip if database not configured for blob granules?
for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
std::string tenant_name = "tenant" + std::to_string(i);
Tenant::createTenant(systemTx, toBytesRef(tenant_name));
std::string tenant_str = "tenant" + std::to_string(i);
BytesRef tenant_name = toBytesRef(tenant_str);
tenants[i] = db.openTenant(tenant_name);
std::string rangeEnd = "\xff";
blobbifyResults[i - (batch * batch_size)] =
tenants[i].blobbifyRange(BytesRef(), toBytesRef(rangeEnd));
}
auto future_commit = systemTx.commit();
const auto rc = waitAndHandleError(systemTx, future_commit, "CREATE_TENANT");
if (rc == FutureRC::OK) {
// Keep going with reset transaction if commit was successful
systemTx.reset();
} else if (rc == FutureRC::RETRY) {
// We want to retry this batch, so decrement the number
// and go back through the loop to get the same value
// Transaction is already reset
--batch;
} else {
// Abort
return -1;
for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
while (true) {
// not technically an operation that's part of systemTx, but it works
const auto rc =
waitAndHandleError(systemTx, blobbifyResults[i - (batch * batch_size)], "BLOBBIFY_TENANT");
if (rc == FutureRC::OK) {
if (!blobbifyResults[i - (batch * batch_size)].get()) {
fmt::print("Blobbifying tenant {0} failed!\n", i);
return -1;
}
break;
} else if (rc == FutureRC::RETRY) {
continue;
} else {
// Abort
return -1;
}
}
}
systemTx.reset();
}
} else {
std::string last_tenant_name = "tenant" + std::to_string(args.total_tenants - 1);
@ -1261,7 +1297,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
/* name, has_arg, flag, val */
{ "api_version", required_argument, NULL, 'a' },
{ "cluster", required_argument, NULL, 'c' },
{ "num_databases", optional_argument, NULL, 'd' },
{ "num_databases", required_argument, NULL, 'd' },
{ "procs", required_argument, NULL, 'p' },
{ "threads", required_argument, NULL, 't' },
{ "async_xacts", required_argument, NULL, ARG_ASYNC },
@ -1312,6 +1348,17 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE },
{ NULL, 0, NULL, 0 }
};
/* For optional arguments, optarg is only set when the argument is passed as "--option=[ARGUMENT]" but not as
"--option [ARGUMENT]". This function sets optarg in the latter case. See
https://cfengine.com/blog/2021/optional-arguments-with-getopt-long/ for a more detailed explanation */
#define SET_OPT_ARG_IF_PRESENT() \
{ \
if (optarg == NULL && optind < argc && argv[optind][0] != '-') { \
optarg = argv[optind++]; \
} \
}
idx = 0;
c = getopt_long(argc, argv, short_options, long_options, &idx);
if (c < 0) {
@ -1513,9 +1560,8 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.disable_ryw = 1;
break;
case ARG_JSON_REPORT:
if (optarg == NULL && (argv[optind] == NULL || (argv[optind] != NULL && argv[optind][0] == '-'))) {
// if --report_json is the last option and no file is specified
// or --report_json is followed by another option
SET_OPT_ARG_IF_PRESENT();
if (!optarg) {
char default_file[] = "mako.json";
strncpy(args.json_output_path, default_file, sizeof(default_file));
} else {
@ -1526,13 +1572,12 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.bg_materialize_files = true;
strncpy(args.bg_file_path, optarg, std::min(sizeof(args.bg_file_path), strlen(optarg) + 1));
case ARG_EXPORT_PATH:
if (optarg == NULL && (argv[optind] == NULL || (argv[optind] != NULL && argv[optind][0] == '-'))) {
SET_OPT_ARG_IF_PRESENT();
if (!optarg) {
char default_file[] = "sketch_data.json";
strncpy(args.stats_export_path, default_file, sizeof(default_file));
} else {
strncpy(args.stats_export_path,
argv[optind],
std::min(sizeof(args.stats_export_path), strlen(argv[optind]) + 1));
strncpy(args.stats_export_path, optarg, std::min(sizeof(args.stats_export_path), strlen(optarg) + 1));
}
break;
case ARG_DISTRIBUTED_TRACER_CLIENT:

View File

@ -22,7 +22,7 @@
#define MAKO_HPP
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#endif
#include <array>

View File

@ -29,7 +29,7 @@
#include <inttypes.h>
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#endif
#include <foundationdb/fdb_c.h>

View File

@ -20,7 +20,7 @@
// Unit tests that test the timeouts for a disconnected cluster
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <chrono>

View File

@ -231,6 +231,10 @@ Int64Future Transaction::get_approximate_size() {
return Int64Future(fdb_transaction_get_approximate_size(tr_));
}
Int64Future Transaction::get_total_cost() {
return Int64Future(fdb_transaction_get_total_cost(tr_));
}
KeyFuture Transaction::get_versionstamp() {
return KeyFuture(fdb_transaction_get_versionstamp(tr_));
}

View File

@ -39,7 +39,7 @@
#pragma once
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <string>
@ -276,6 +276,9 @@ public:
// Returns a future which will be set to the approximate transaction size so far.
Int64Future get_approximate_size();
// Returns a future which will be set to the transaction's total cost so far.
Int64Future get_total_cost();
// Returns a future which will be set to the versionstamp which was used by
// any versionstamp operations in the transaction.
KeyFuture get_versionstamp();

View File

@ -20,7 +20,7 @@
// Unit tests for API setup, network initialization functions from the FDB C API.
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <iostream>
#include <thread>

View File

@ -21,7 +21,7 @@
// Unit tests for the FoundationDB C API.
#include "fdb_c_options.g.h"
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <assert.h>
#include <string.h>
@ -1945,6 +1945,30 @@ TEST_CASE("fdb_transaction_get_committed_version") {
}
}
TEST_CASE("fdb_transaction_get_total_cost") {
fdb::Transaction tr(db);
while (1) {
fdb::ValueFuture f1 = tr.get("foo", /*snapshot*/ false);
fdb_error_t err = wait_future(f1);
if (err) {
fdb::EmptyFuture fOnError = tr.on_error(err);
fdb_check(wait_future(fOnError));
continue;
}
fdb::Int64Future f2 = tr.get_total_cost();
err = wait_future(f2);
if (err) {
fdb::EmptyFuture fOnError = tr.on_error(err);
fdb_check(wait_future(fOnError));
continue;
}
int64_t cost;
fdb_check(f2.get(&cost));
CHECK(cost > 0);
break;
}
}
TEST_CASE("fdb_transaction_get_approximate_size") {
fdb::Transaction tr(db);
while (1) {

View File

@ -18,7 +18,7 @@
* limitations under the License.
*/
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include "foundationdb/fdb_c.h"
#undef DLLEXPORT
#include "workloads.h"

View File

@ -76,38 +76,11 @@ function(generate_coverage_xml)
add_dependencies(coverage_${target_name} coveragetool)
endfunction()
# This function asserts that `versions.h` does not exist in the source
# directory. It does this in the prebuild phase of the target.
# This is an ugly hack that should make sure that cmake isn't used with
# a source directory in which FDB was previously built with `make`.
function(assert_no_version_h target)
message(STATUS "Check versions.h on ${target}")
set(target_name "${target}_versions_h_check")
if (DEFINED ENV{VERBOSE})
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMAND echo
"${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-DFILE="${CMAKE_SOURCE_DIR}/versions.h"
COMMENT "Check old build system wasn't used in source dir")
else()
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMENT "Check old build system wasn't used in source dir")
endif()
add_dependencies(${target} ${target_name})
endfunction()
add_custom_target(strip_targets)
add_dependencies(packages strip_targets)
function(strip_debug_symbols target)
if (WIN32)
if(WIN32)
return()
endif()
get_target_property(target_type ${target} TYPE)
@ -146,7 +119,7 @@ function(strip_debug_symbols target)
COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
else()
add_custom_target(strip_${target})
add_dependencies(strip_${target} strip_only_${target})
@ -171,7 +144,7 @@ function(copy_headers)
foreach(f IN LISTS CP_SRCS)
is_prefix(bd "${CMAKE_CURRENT_BINARY_DIR}" "${f}")
is_prefix(sd "${CMAKE_CURRENT_SOURCE_DIR}" "${f}")
if (bd OR sd)
if(bd OR sd)
continue()
endif()
is_header(hdr "${f}")
@ -180,7 +153,7 @@ function(copy_headers)
endif()
get_filename_component(fname ${f} NAME)
get_filename_component(dname ${f} DIRECTORY)
if (dname)
if(dname)
make_directory(${incl_dir}/${dname})
endif()
set(fpath "${incl_dir}/${dname}/${fname}")
@ -309,9 +282,6 @@ function(add_flow_target)
add_custom_target(${AFT_NAME}_actors DEPENDS ${generated_files})
add_dependencies(${AFT_NAME} ${AFT_NAME}_actors)
if(NOT WIN32)
assert_no_version_h(${AFT_NAME}_actors)
endif()
generate_coverage_xml(${AFT_NAME})
if(strip_target)
strip_debug_symbols(${AFT_NAME})

View File

@ -8,40 +8,43 @@ endif()
include(ExternalProject)
ExternalProject_Add(awssdk_project
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
# it seems advice.detachedHead breaks something which causes aws sdk to always be rebuilt.
# This option forces cmake to build the aws sdk only once and never attempt to update it
UPDATE_DISCONNECTED ON
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
add_library(awssdk_core STATIC IMPORTED)
add_dependencies(awssdk_core awssdk_project)

View File

@ -303,7 +303,6 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@ -315,7 +314,7 @@ class TestRun:
# state for the run
self.retryable_error: bool = False
self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed,
will_restart=will_restart)
will_restart=will_restart, long_running=config.long_running)
self.run_time: int = 0
self.success = self.run()
@ -367,6 +366,11 @@ class TestRun:
command += ['-b', 'on']
if config.crash_on_error:
command.append('--crash')
if config.long_running:
# disable simulation speedup
command += ['--knob-sim-speedup-after-seconds=36000']
# disable traceTooManyLines Error MAX_TRACE_LINES
command += ['--knob-max-trace-lines=1000000000']
self.temp_path.mkdir(parents=True, exist_ok=True)
@ -376,7 +380,8 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
# No timeout for long running tests
timeout = 20 * config.kill_seconds if self.use_valgrind else (None if config.long_running else config.kill_seconds)
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)

View File

@ -291,11 +291,12 @@ class Summary:
def __init__(self, binary: Path, runtime: float = 0, max_rss: int | None = None,
was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None,
exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None,
error_out: str = None, will_restart: bool = False):
error_out: str = None, will_restart: bool = False, long_running: bool = False):
self.binary = binary
self.runtime: float = runtime
self.max_rss: int | None = max_rss
self.was_killed: bool = was_killed
self.long_running = long_running
self.expected_unseed: int | None = expected_unseed
self.exit_code: int = exit_code
self.out: SummaryTree = SummaryTree('Test')
@ -396,6 +397,10 @@ class Summary:
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
if self.long_running:
# debugging info for long-running tests
child.attributes['LongRunning'] = '1'
child.attributes['Runtime'] = str(self.runtime)
self.out.append(child)
self.error = True
if self.max_rss is not None:

View File

@ -55,6 +55,6 @@ if __name__ == '__main__':
summary.summarize_files(files)
summary.out.dump(sys.stdout)
else:
summary = Summary(Path('bin/fdbserver'), was_killed=True)
summary = Summary(Path('bin/fdbserver'), was_killed=True, long_running=config.long_running)
summary.summarize_files(files)
summary.out.dump(sys.stdout)

View File

@ -11,16 +11,16 @@ The global tag throttler bases throttling decisions on "quotas" provided by clie
The global tag throttler cannot throttle tags to a throughput below the reserved quota, and it cannot allow throughput to exceed the total quota.
### Cost
Internally, the units for these quotas are "page costs", computed as follows. The "page cost" of a read operation is computed as:
Internally, the units for these quotas are bytes. The cost of an operation is rounded up to the nearest page size. The cost of a read operation is computed as:
```
readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
```
The "page cost" of a write operation is computed as:
The cost of a write operation is computed as:
```
writeCost = SERVER_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR);
writeCost = CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
```
Here `bytesWritten` includes cleared bytes. The size of range clears is estimated at commit time.
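For example, assuming the default knob values that appear elsewhere in this commit (`READ_COST_BYTE_FACTOR = WRITE_COST_BYTE_FACTOR = 16384`, `GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO = 5.0`), a 20,000-byte read and a 1,000-byte write would cost:
```
readCost  = ceiling(20000 / 16384) * 16384 = 2 * 16384     = 32768
writeCost = 5.0 * ceiling(1000 / 16384) * 16384 = 5.0 * 16384 = 81920
```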
@ -41,12 +41,6 @@ To set the quota through `fdbcli`, run:
fdbcli> quota set <tag> [reserved_throughput|total_throughput] <bytes_per_second>
```
Note that the quotas are specified in terms of bytes/second, and internally converted to page costs:
```
page_cost_quota = ceiling(byte_quota / CLIENT_KNOBS->READ_COST_BYTE_FACTOR)
```
To clear both the reserved and total throughput quotas for a tag, run:
```

View File

@ -34,20 +34,25 @@ Commit proxies would combine idempotency IDs for transactions within a batch. Th
## Value format
```
${protocol_version}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
${protocol_version}${timestamp}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
```
The batch index for each idempotency id can be reconstructed from the high order byte and low order bytes stored in the key and value, respectively. This is necessary for an "unknown_committed" transaction to recover its full version stamp. Batch index is a `short int`, i.e. 2 bytes.
The timestamp is the unix epoch stored as a little-endian signed 64-bit integer.
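As an illustrative sketch (not code from this PR), the reconstruction described above amounts to combining the high-order byte from the key with the low-order byte from the value:

```cpp
#include <cstdint>

// Illustrative only: rebuild the 2-byte batch index from the high-order byte
// stored in the key and the low-order byte stored in the value.
uint16_t reconstructBatchIndex(uint8_t highByteFromKey, uint8_t lowByteFromValue) {
    return static_cast<uint16_t>((highByteFromKey << 8) | lowByteFromValue);
}
```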
# Cleaning up old idempotency ids
After learning the result of an attempt to commit a transaction with an
idempotency id, the client may inform the cluster that it's no longer interested
in that id and the cluster can reclaim the space used to store the idempotency
id. The happy-path reply to a CommitTransactionRequest will say which proxy this
request should be sent to, and all idempotency ids for a database key will be
sent to the same proxy so that it can clear the key once it receives all of
them. The first proxy will also periodically clean up the oldest idempotency ids, based on a policy determined by two knobs. One knob will control the minimum lifetime of an idempotency id (i.e. don't delete anything younger than 1 day), and the other will control the target byte size of the idempotency keys (e.g. keep 100 MB of idempotency keys around).
id. The commit proxy that committed a batch is responsible for cleaning all
idempotency kv pairs from that batch, so clients must tell that specific proxy
that they're done with the id. The first proxy will also periodically clean up
the oldest idempotency ids, based on a policy determined by two knobs. One knob
will control the minimum lifetime of an idempotency id (i.e. don't delete
anything younger than 1 day), and the other will control the target byte size of
the idempotency keys (e.g. keep 100 MB of idempotency keys around).
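A minimal sketch of the cleanup policy described above, with hypothetical knob names and values (the actual knobs are not named here):

```cpp
#include <cstdint>

// Hypothetical values, for illustration only.
constexpr double MIN_IDEMPOTENCY_ID_AGE_SECONDS = 24 * 60 * 60;       // ~1 day
constexpr int64_t IDEMPOTENCY_IDS_TARGET_BYTES = 100LL * 1024 * 1024; // ~100 MB

// An id is eligible for periodic cleanup only if it is old enough and the
// total size of stored idempotency keys still exceeds the target.
bool eligibleForCleanup(double idAgeSeconds, int64_t totalIdempotencyBytes) {
    return idAgeSeconds >= MIN_IDEMPOTENCY_ID_AGE_SECONDS &&
           totalIdempotencyBytes > IDEMPOTENCY_IDS_TARGET_BYTES;
}
```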
# Commit protocol

View File

@ -49,7 +49,7 @@ master_doc = 'index'
# General information about the project.
project = u'FoundationDB'
copyright = u'2013-2021 Apple, Inc and the FoundationDB project authors'
copyright = u'2013-2022 Apple, Inc and the FoundationDB project authors'
# Load the version information from 'versions.target'
import xml.etree.ElementTree as ET

View File

@ -142,6 +142,8 @@ Here is a complete list of valid parameters:
*multipart_min_part_size* (or *minps*) - Min part size for multipart uploads.
*enable_read_cache* (or *erc*) - Whether to enable the read block cache.
*read_block_size* (or *rbs*) - Block size in bytes to be used for reads.
*read_ahead_blocks* (or *rab*) - Number of blocks to read ahead of the requested offset.
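For example, a hypothetical backup URL that tunes these read parameters (host, credentials, and values are purely illustrative) could look like ``blobstore://mykey:mysecret@backup.example.com/my_backup?bucket=fdb_backups&erc=1&rbs=1048576&rab=4``.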

View File

@ -2,6 +2,12 @@
Release Notes
#############
6.3.25
======
* Fixed a transaction log data corruption bug. `(PR #8558) <https://github.com/apple/foundationdb/pull/8558>`_
* Fixed a special keyspace ``SpecialKeyRangeAsyncImpl::getRange`` bug. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_
* Fixed a special keyspace ``ConflictingKeysImpl::getRange`` bug. `(PR #7724) <https://github.com/apple/foundationdb/pull/7724>`_
6.3.24
======
* Fixed a bug where get key location can overload proxies. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_

View File

@ -2,6 +2,24 @@
Release Notes
#############
7.1.25
======
* Same as 7.1.24 release with AVX enabled.
7.1.24
======
* Released with AVX disabled.
* Fixed a transaction log data corruption bug. `(PR #8525) <https://github.com/apple/foundationdb/pull/8525>`_, `(PR #8562) <https://github.com/apple/foundationdb/pull/8562>`_, and `(PR #8647) <https://github.com/apple/foundationdb/pull/8647>`_
* Fixed a rare data race in transaction logs when PEEK_BATCHING_EMPTY_MSG is enabled. `(PR #8660) <https://github.com/apple/foundationdb/pull/8660>`_
* Changed consistency check to report all corruptions. `(PR #8571) <https://github.com/apple/foundationdb/pull/8571>`_
* Fixed a rare storage server crashing bug after recovery. `(PR #8468) <https://github.com/apple/foundationdb/pull/8468>`_
* Added client knob UNLINKONLOAD_FDBCLIB to control deletion of external client libraries. `(PR #8434) <https://github.com/apple/foundationdb/pull/8434>`_
* Updated the default peer latency degradation percentile to 0.5. `(PR #8370) <https://github.com/apple/foundationdb/pull/8370>`_
* Made exclusion less pessimistic when warning about low space usage. `(PR #8347) <https://github.com/apple/foundationdb/pull/8347>`_
* Added storage server readrange and update latency metrics. `(PR #8353) <https://github.com/apple/foundationdb/pull/8353>`_
* Increased the default PEER_DEGRADATION_CONNECTION_FAILURE_COUNT value to 5s. `(PR #8336) <https://github.com/apple/foundationdb/pull/8336>`_
* Increased RocksDB block cache size. `(PR #8274) <https://github.com/apple/foundationdb/pull/8274>`_
7.1.23
======
* Same as 7.1.22 release with AVX enabled.

View File

@ -43,9 +43,9 @@ Optional<LimitType> parseLimitType(StringRef token) {
}
}
Optional<double> parseLimitValue(StringRef token) {
Optional<int64_t> parseLimitValue(StringRef token) {
try {
return std::stod(token.toString());
return std::stol(token.toString());
} catch (...) {
return {};
}
@ -63,9 +63,9 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
} else {
auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
if (limitType == LimitType::TOTAL) {
fmt::print("{}\n", quota.totalQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
fmt::print("{}\n", quota.totalQuota);
} else if (limitType == LimitType::RESERVED) {
fmt::print("{}\n", quota.reservedQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
fmt::print("{}\n", quota.reservedQuota);
}
}
return Void();
@ -75,7 +75,7 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
}
}
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, double value) {
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, int64_t value) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -89,9 +89,13 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
// Internally, costs are stored in terms of pages, but in the API,
// costs are specified in terms of bytes
if (limitType == LimitType::TOTAL) {
quota.totalQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
// Round up to nearest page size
quota.totalQuota =
((value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
} else if (limitType == LimitType::RESERVED) {
quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
// Round up to nearest page size
quota.reservedQuota =
((value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();

View File

@ -175,11 +175,13 @@ Future<Reference<IAsyncFile>> BackupContainerS3BlobStore::readFile(const std::st
if (usesEncryption()) {
f = makeReference<AsyncFileEncrypted>(f, AsyncFileEncrypted::Mode::READ_ONLY);
}
f = makeReference<AsyncFileReadAheadCache>(f,
m_bstore->knobs.read_block_size,
m_bstore->knobs.read_ahead_blocks,
m_bstore->knobs.concurrent_reads_per_file,
m_bstore->knobs.read_cache_blocks_per_file);
if (m_bstore->knobs.enable_read_cache) {
f = makeReference<AsyncFileReadAheadCache>(f,
m_bstore->knobs.read_block_size,
m_bstore->knobs.read_ahead_blocks,
m_bstore->knobs.concurrent_reads_per_file,
m_bstore->knobs.read_cache_blocks_per_file);
}
return f;
}

View File

@ -76,6 +76,10 @@ BlobCipherMetrics::BlobCipherMetrics()
UID(),
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE),
getBlobMetadataLatency("GetBlobMetadataLatency",
UID(),
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE),
counterSets({ CounterSet(cc, "TLog"),
CounterSet(cc, "KVMemory"),
CounterSet(cc, "KVRedwood"),

View File

@ -232,10 +232,10 @@ void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys,
.detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt);
throw encrypt_header_metadata_mismatch();
}
// Validate encryption header 'cipherHeader' details sanity
if (!(header.cipherHeaderDetails.baseCipherId == eKeys.headerCipherKey->getBaseCipherId() &&
header.cipherHeaderDetails.encryptDomainId == eKeys.headerCipherKey->getDomainId() &&
header.cipherHeaderDetails.salt == eKeys.headerCipherKey->getSalt())) {
// Validate encryption header 'cipherText' details sanity
if (!(header.cipherTextDetails.baseCipherId == eKeys.textCipherKey->getBaseCipherId() &&
header.cipherTextDetails.encryptDomainId == eKeys.textCipherKey->getDomainId() &&
header.cipherTextDetails.salt == eKeys.textCipherKey->getSalt())) {
TraceEvent(SevError, "EncryptionHeader_CipherTextMismatch")
.detail("TextDomainId", eKeys.textCipherKey->getDomainId())
.detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId)
@ -650,12 +650,12 @@ struct IndexedBlobGranuleFile {
IndexBlobGranuleFileChunkRef chunkRef =
IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena);
ChildType child;
ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), IncludeVersion());
dataReader.deserialize(FileIdentifierFor<ChildType>::value, child, childArena);
// TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused?
return Standalone<ChildType>(child, childArena);
BinaryReader br(chunkRef.chunkBytes.get(), IncludeVersion());
Standalone<ChildType> child;
br >> child;
return child;
}
template <class Ar>
@ -751,7 +751,7 @@ Value serializeChunkedSnapshot(const Standalone<StringRef>& fileNameRef,
if (currentChunkBytesEstimate >= targetChunkBytes || i == snapshot.size() - 1) {
Value serialized =
ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
BinaryWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
Value chunkBytes =
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
chunks.push_back(chunkBytes);
@ -1020,7 +1020,7 @@ Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef,
if (currentChunkBytesEstimate >= chunkSize || i == boundaries.size() - 1) {
Value serialized =
ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
BinaryWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
Value chunkBytes =
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
chunks.push_back(chunkBytes);

View File

@ -0,0 +1,109 @@
/*
* BlobMetadataUtils.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobMetadataUtils.h"
#include "fmt/format.h"
#include "flow/IRandom.h"
#include "flow/flow.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/S3BlobStore.h"
std::string buildPartitionPath(const std::string& url, const std::string& partition) {
ASSERT(!partition.empty());
ASSERT(partition.front() != '/');
ASSERT(partition.back() == '/');
StringRef u(url);
if (u.startsWith("file://"_sr)) {
ASSERT(u.endsWith("/"_sr));
return url + partition;
} else if (u.startsWith("blobstore://"_sr)) {
std::string resource;
std::string lastOpenError;
S3BlobStoreEndpoint::ParametersT backupParams;
std::string urlCopy = url;
Reference<S3BlobStoreEndpoint> bstore =
S3BlobStoreEndpoint::fromString(url, {}, &resource, &lastOpenError, &backupParams);
ASSERT(!resource.empty());
ASSERT(resource.back() != '/');
size_t resourceStart = url.find(resource);
ASSERT(resourceStart != std::string::npos);
return urlCopy.insert(resourceStart + resource.size(), "/" + partition);
} else {
// FIXME: support azure
throw backup_invalid_url();
}
}
// FIXME: make this (more) deterministic outside of simulation for FDBPerfKmsConnector
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName) {
Standalone<BlobMetadataDetailsRef> metadata;
metadata.domainId = domainId;
metadata.arena().dependsOn(domainName.arena());
metadata.domainName = domainName;
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
TraceEvent ev(SevDebug, "SimBlobMetadata");
ev.detail("DomainId", domainId).detail("TypeNum", type).detail("PartitionCount", partitionCount);
if (type == 0) {
// single storage location
std::string partition = std::to_string(domainId) + "/";
metadata.base = StringRef(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("Base", metadata.base);
}
if (type == 1) {
// simulate hash prefixing in s3
metadata.base = StringRef(metadata.arena(), baseUrl);
ev.detail("Base", metadata.base);
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/");
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
if (type == 2) {
// simulate separate storage location per partition
for (int i = 0; i < partitionCount; i++) {
std::string partition = std::to_string(domainId) + "_" + std::to_string(i) + "/";
metadata.partitions.push_back_deep(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
// set random refresh + expire time
if (deterministicRandom()->coinflip()) {
metadata.refreshAt = now() + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
metadata.expireAt =
metadata.refreshAt + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
} else {
metadata.refreshAt = std::numeric_limits<double>::max();
metadata.expireAt = metadata.refreshAt;
}
return metadata;
}

View File

@ -61,7 +61,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
init( GRV_ERROR_RETRY_DELAY, 5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01();
init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01();
init( UNKNOWN_TENANT_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = 0.01 + deterministicRandom()->random01();
init( REPLY_BYTE_LIMIT, 80000 );
init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01();
init( DEFAULT_MAX_BACKOFF, 1.0 );
@ -220,6 +220,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_CONCURRENT_WRITES_PER_FILE, 5 );
init( BLOBSTORE_CONCURRENT_READS_PER_FILE, 3 );
init( BLOBSTORE_ENABLE_READ_CACHE, true );
init( BLOBSTORE_READ_BLOCK_SIZE, 1024 * 1024 );
init( BLOBSTORE_READ_AHEAD_BLOCKS, 0 );
init( BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2 );
@ -272,7 +273,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( TAG_THROTTLE_EXPIRATION_INTERVAL, 60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0;
init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
// busyness reporting
init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 );
@ -281,6 +282,7 @@ void ClientKnobs::initialize(Randomize randomize) {
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
init( BG_TOO_MANY_GRANULES, 10000 );
init( BLOB_METADATA_REFRESH_INTERVAL, 3600 ); if ( randomize && BUGGIFY ) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 );
init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 );

View File

@ -658,7 +658,7 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
parse((&type), value);
blobGranulesEnabled = (type != 0);
} else if (ck == "encryption_at_rest_mode"_sr) {
encryptionAtRestMode = EncryptionAtRestMode::fromValue(value);
encryptionAtRestMode = EncryptionAtRestMode::fromValueRef(Optional<ValueRef>(value));
} else {
return false;
}

View File

@ -1750,7 +1750,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
state bool done = false;
state int64_t nrKeys = 0;
state bool encryptionEnabled = false;
state Optional<bool> encryptionEnabled;
loop {
state RangeResultWithVersion values;
@ -1816,7 +1816,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(taskBucket->keepRunning(tr, task) &&
storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) &&
storeOrThrow(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr)));
break;
@ -1829,9 +1829,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize));
outFile = f;
encryptionEnabled = encryptionEnabled && cx->clientInfo->get().isEncryptionEnabled;
const bool encrypted =
encryptionEnabled.present() && encryptionEnabled.get() && cx->clientInfo->get().isEncryptionEnabled;
// Initialize range file writer and write begin key
if (encryptionEnabled) {
if (encrypted) {
CODE_PROBE(true, "using encrypted snapshot file writer");
if (!tenantCache.isValid()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.cpp
* IdempotencyId.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
@ -18,9 +18,11 @@
* limitations under the License.
*/
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // this has to be the last include
struct IdempotencyIdKVBuilderImpl {
Optional<Version> commitVersion;
@ -40,6 +42,7 @@ void IdempotencyIdKVBuilder::add(const IdempotencyIdRef& id, uint16_t batchIndex
ASSERT((batchIndex >> 8) == impl->batchIndexHighOrderByte.get());
} else {
impl->batchIndexHighOrderByte = batchIndex >> 8;
impl->value << int64_t(now());
}
StringRef s = id.asStringRefUnsafe();
impl->value << uint8_t(s.size());
@ -53,19 +56,17 @@ Optional<KeyValue> IdempotencyIdKVBuilder::buildAndClear() {
return {};
}
BinaryWriter key{ Unversioned() };
key.serializeBytes(idempotencyIdKeys.begin);
key << bigEndian64(impl->commitVersion.get());
key << impl->batchIndexHighOrderByte.get();
Value v = impl->value.toValue();
KeyRef key =
makeIdempotencySingleKeyRange(v.arena(), impl->commitVersion.get(), impl->batchIndexHighOrderByte.get()).begin;
impl->value = BinaryWriter(IncludeVersion());
impl->batchIndexHighOrderByte = Optional<uint8_t>();
Optional<KeyValue> result = KeyValue();
result.get().arena() = v.arena();
result.get().key = key.toValue(result.get().arena());
result.get().key = key;
result.get().value = v;
return result;
}
@ -86,6 +87,8 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
// Even if id is a substring of value, it may still not actually contain it.
BinaryReader reader(kv.value.begin(), kv.value.size(), IncludeVersion());
int64_t timestamp; // ignored
reader >> timestamp;
while (!reader.empty()) {
uint8_t length;
reader >> length;
@ -93,13 +96,9 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
uint8_t lowOrderBatchIndex;
reader >> lowOrderBatchIndex;
if (candidate == needle) {
BinaryReader reader(kv.key.begin(), kv.key.size(), Unversioned());
reader.readBytes(idempotencyIdKeys.begin.size());
Version commitVersion;
reader >> commitVersion;
commitVersion = bigEndian64(commitVersion);
uint8_t highOrderBatchIndex;
reader >> highOrderBatchIndex;
decodeIdempotencyKey(kv.key, commitVersion, highOrderBatchIndex);
return CommitResult{ commitVersion,
static_cast<uint16_t>((uint16_t(highOrderBatchIndex) << 8) |
uint16_t(lowOrderBatchIndex)) };
@ -172,4 +171,35 @@ TEST_CASE("/fdbclient/IdempotencyId/serialization") {
ASSERT(t == id);
}
return Void();
}
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex) {
static const auto size =
idempotencyIdKeys.begin.size() + sizeof(version) + sizeof(highOrderBatchIndex) + /*\x00*/ 1;
StringRef second = makeString(size, arena);
auto* dst = mutateString(second);
memcpy(dst, idempotencyIdKeys.begin.begin(), idempotencyIdKeys.begin.size());
dst += idempotencyIdKeys.begin.size();
version = bigEndian64(version);
memcpy(dst, &version, sizeof(version));
dst += sizeof(version);
*dst++ = highOrderBatchIndex;
*dst++ = 0;
ASSERT_EQ(dst - second.begin(), size);
return KeyRangeRef(second.removeSuffix("\x00"_sr), second);
}
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex) {
BinaryReader reader(key, Unversioned());
reader.readBytes(idempotencyIdKeys.begin.size());
reader >> commitVersion;
commitVersion = bigEndian64(commitVersion);
reader >> highOrderBatchIndex;
}
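As a sanity check, a minimal round-trip sketch in the style of the unit test above (assumes the flow unit-test harness and the two helpers just added; not part of this change):
TEST_CASE("/fdbclient/IdempotencyId/keyRoundTrip") {
    Arena arena;
    Version version = 12345; // hypothetical commit version
    uint8_t highOrderBatchIndex = 7; // hypothetical high-order batch index byte

    // Encode: the begin key of the single-key range is the idempotency key itself
    KeyRangeRef range = makeIdempotencySingleKeyRange(arena, version, highOrderBatchIndex);

    // Decode and verify that both fields survive the big-endian version encoding
    Version decodedVersion;
    uint8_t decodedHighOrderBatchIndex;
    decodeIdempotencyKey(range.begin, decodedVersion, decodedHighOrderBatchIndex);
    ASSERT_EQ(decodedVersion, version);
    ASSERT_EQ(decodedHighOrderBatchIndex, highOrderBatchIndex);
    return Void();
}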

View File

@ -2639,7 +2639,8 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") {
ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource),
"",
"",
currentProtocolVersion());
currentProtocolVersion(),
false);
}
workers.push_back(data);

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "flow/Trace.h"
#ifdef ADDRESS_SANITIZER
#include <sanitizer/lsan_interface.h>
#endif
@ -414,6 +415,20 @@ Version DLTransaction::getCommittedVersion() {
return version;
}
ThreadFuture<int64_t> DLTransaction::getTotalCost() {
if (!api->transactionGetTotalCost) {
return unsupported_operation();
}
FdbCApi::FDBFuture* f = api->transactionGetTotalCost(tr);
return toThreadFuture<int64_t>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
int64_t size = 0;
FdbCApi::fdb_error_t error = api->futureGetInt64(f, &size);
ASSERT(!error);
return size;
});
}
ThreadFuture<int64_t> DLTransaction::getApproximateSize() {
if (!api->transactionGetApproximateSize) {
return unsupported_operation();
@ -950,6 +965,11 @@ void DLApi::init() {
fdbCPath,
"fdb_transaction_get_committed_version",
headerVersion >= 0);
loadClientFunction(&api->transactionGetTotalCost,
lib,
fdbCPath,
"fdb_transaction_get_total_cost",
headerVersion >= ApiVersion::withGetTotalCost().version());
loadClientFunction(&api->transactionGetApproximateSize,
lib,
fdbCPath,
@ -1486,6 +1506,12 @@ ThreadFuture<SpanContext> MultiVersionTransaction::getSpanContext() {
return SpanContext();
}
ThreadFuture<int64_t> MultiVersionTransaction::getTotalCost() {
auto tr = getTransaction();
auto f = tr.transaction ? tr.transaction->getTotalCost() : makeTimeout<int64_t>();
return abortableFuture(f, tr.onChange);
}
ThreadFuture<int64_t> MultiVersionTransaction::getApproximateSize() {
auto tr = getTransaction();
auto f = tr.transaction ? tr.transaction->getApproximateSize() : makeTimeout<int64_t>();
@ -1863,6 +1889,9 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional
TraceEvent("UnknownDatabaseOption").detail("Option", option);
throw invalid_option();
}
if (itr->first == FDBDatabaseOptions::USE_CONFIG_DATABASE) {
dbState->isConfigDB = true;
}
int defaultFor = itr->second.defaultFor;
if (defaultFor >= 0) {
@ -1969,7 +1998,7 @@ ThreadFuture<ProtocolVersion> MultiVersionDatabase::getServerProtocol(Optional<P
MultiVersionDatabase::DatabaseState::DatabaseState(ClusterConnectionRecord const& connectionRecord,
Reference<IDatabase> versionMonitorDb)
: dbVar(new ThreadSafeAsyncVar<Reference<IDatabase>>(Reference<IDatabase>(nullptr))),
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false) {}
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false), isConfigDB(false) {}
// Adds a client (local or externally loaded) that can be used to connect to the cluster
void MultiVersionDatabase::DatabaseState::addClient(Reference<ClientInfo> client) {
@ -2167,8 +2196,12 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
.detail("ConnectionRecord", connectionRecord);
}
}
// Verify the database has the necessary functionality to update the shared
// state. Avoid updating the shared state if the database is a
// configuration database, because a configuration database does not have
// access to typical system keys and does not need to be updated.
if (db.isValid() && dbProtocolVersion.present() &&
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) {
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap() && !isConfigDB) {
Future<std::string> updateResult =
MultiVersionApi::api->updateClusterSharedStateMap(connectionRecord, dbProtocolVersion.get(), db);
sharedStateUpdater = map(errorOr(updateResult), [this](ErrorOr<std::string> result) {
@ -2780,11 +2813,19 @@ void MultiVersionApi::runNetwork() {
});
}
localClient->api->runNetwork();
try {
localClient->api->runNetwork();
} catch (const Error& e) {
closeTraceFile();
throw e;
}
for (auto h : handles) {
waitThread(h);
}
TraceEvent("MultiVersionRunNetworkTerminating");
closeTraceFile();
}
void MultiVersionApi::stopNetwork() {

View File

@ -3456,6 +3456,8 @@ ACTOR Future<Optional<Value>> getValue(Reference<TransactionState> trState,
}
trState->cx->getValueCompleted->latency = timer_int() - startTime;
trState->cx->getValueCompleted->log();
trState->totalCost +=
getReadOperationCost(key.size() + (reply.value.present() ? reply.value.get().size() : 0));
if (getValueID.present()) {
g_traceBatch.addEvent("GetValueDebug",
@ -4015,6 +4017,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
req.version = version;
req.begin = firstGreaterOrEqual(range.begin);
req.end = firstGreaterOrEqual(range.end);
setMatchIndex<GetKeyValuesFamilyRequest>(req, matchIndex);
req.spanContext = span.context;
trState->cx->getLatestCommitVersions(
@ -4284,6 +4287,7 @@ void getRangeFinished(Reference<TransactionState> trState,
RangeResultFamily result) {
int64_t bytes = getRangeResultFamilyBytes(result);
trState->totalCost += getReadOperationCost(bytes);
trState->cx->transactionBytesRead += bytes;
trState->cx->transactionKeysRead += result.size();
@ -5766,6 +5770,7 @@ void Transaction::set(const KeyRef& key, const ValueRef& value, AddConflictRange
auto r = singleKeyRange(key, req.arena);
auto v = ValueRef(req.arena, value);
t.mutations.emplace_back(req.arena, MutationRef::SetValue, r.begin, v);
trState->totalCost += getWriteOperationCost(key.expectedSize() + value.expectedSize());
if (addConflictRange) {
t.write_conflict_ranges.push_back(req.arena, r);
@ -5795,6 +5800,7 @@ void Transaction::atomicOp(const KeyRef& key,
auto v = ValueRef(req.arena, operand);
t.mutations.emplace_back(req.arena, operationType, r.begin, v);
trState->totalCost += getWriteOperationCost(key.expectedSize());
if (addConflictRange && operationType != MutationRef::SetVersionstampedKey)
t.write_conflict_ranges.push_back(req.arena, r);
@ -5826,7 +5832,10 @@ void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRa
return;
t.mutations.emplace_back(req.arena, MutationRef::ClearRange, r.begin, r.end);
// NOTE: The throttling cost of each clear is assumed to be one page.
// This makes computation fast, but can be inaccurate and may
// underestimate the cost of large clears.
trState->totalCost += CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
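// Illustration (not part of this change): with the default WRITE_COST_BYTE_FACTOR of
// 16384, every clear is charged as a single 16KB page regardless of how much data it
// removes, while set() and atomicOp() above charge getWriteOperationCost() on the
// actual mutation size, rounded up using the same byte factor.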
if (addConflictRange)
t.write_conflict_ranges.push_back(req.arena, r);
}
@ -6142,6 +6151,7 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
IdempotencyIdRef idempotencyId) {
state Transaction tr(trState->cx);
state int retries = 0;
state Version expiredVersion;
state Span span("NAPI:determineCommitStatus"_loc, trState->spanContext);
tr.span.setParent(span.context);
loop {
@ -6151,11 +6161,19 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
tr.trState->authToken = trState->authToken;
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
KeyBackedObjectProperty<IdempotencyIdsExpiredVersion, _Unversioned> expiredKey(idempotencyIdsExpiredVersion,
Unversioned());
IdempotencyIdsExpiredVersion expiredVal = wait(expiredKey.getD(&tr));
expiredVersion = expiredVal.expired;
if (expiredVersion >= minPossibleCommitVersion) {
throw commit_unknown_result_fatal();
}
Version rv = wait(tr.getReadVersion());
TraceEvent("DetermineCommitStatusAttempt")
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries)
.detail("ReadVersion", rv)
.detail("ExpiredVersion", expiredVersion)
.detail("MinPossibleCommitVersion", minPossibleCommitVersion)
.detail("MaxPossibleCommitVersion", maxPossibleCommitVersion);
KeyRange possibleRange =
@ -6230,14 +6248,14 @@ ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Referen
state int i = 0;
for (; i < transaction->mutations.size(); ++i) {
auto* it = &transaction->mutations[i];
auto const& mutation = transaction->mutations[i];
if (it->type == MutationRef::Type::SetValue || it->isAtomicOp()) {
if (mutation.type == MutationRef::Type::SetValue || mutation.isAtomicOp()) {
trCommitCosts.opsCount++;
trCommitCosts.writeCosts += getWriteOperationCost(it->expectedSize());
} else if (it->type == MutationRef::Type::ClearRange) {
trCommitCosts.writeCosts += getWriteOperationCost(mutation.expectedSize());
} else if (mutation.type == MutationRef::Type::ClearRange) {
trCommitCosts.opsCount++;
keyRange = KeyRangeRef(it->param1, it->param2);
keyRange = KeyRangeRef(mutation.param1, mutation.param2);
if (trState->options.expensiveClearCostEstimation) {
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY, trState));
trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
@ -6366,8 +6384,11 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
}
if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) {
wait(store(req.transaction.read_snapshot, readVersion) &&
store(req.commitCostEstimation, estimateCommitCosts(trState, &req.transaction)));
state Future<Optional<ClientTrCommitCostEstimation>> commitCostFuture =
estimateCommitCosts(trState, &req.transaction);
// We need to wait for the read version first so that we can be notified if the database is locked
wait(store(req.transaction.read_snapshot, readVersion));
wait(store(req.commitCostEstimation, commitCostFuture));
} else {
wait(store(req.transaction.read_snapshot, readVersion));
}
@ -6399,6 +6420,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
req.debugID = commitID;
state Future<CommitID> reply;
// Only gets filled in in the happy path where we don't have to commit on the first proxy or use provisional
// proxies
state int alternativeChosen = -1;
// Only valid if alternativeChosen >= 0
state Reference<CommitProxyInfo> proxiesUsed;
if (trState->options.commitOnFirstProxy) {
if (trState->cx->clientInfo->get().firstCommitProxy.present()) {
reply = throwErrorOr(brokenPromiseToMaybeDelivered(
@ -6409,11 +6436,13 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
: Never();
}
} else {
reply = basicLoadBalance(trState->cx->getCommitProxies(trState->useProvisionalProxies),
proxiesUsed = trState->cx->getCommitProxies(trState->useProvisionalProxies);
reply = basicLoadBalance(proxiesUsed,
&CommitProxyInterface::commit,
req,
TaskPriority::DefaultPromiseEndpoint,
AtMostOnce::True);
AtMostOnce::True,
&alternativeChosen);
}
state double grvTime = now();
choose {
@ -6463,6 +6492,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
ci.version,
req,
trState->tenant()));
if (trState->automaticIdempotency && alternativeChosen >= 0) {
// Automatic idempotency means we're responsible for best-effort idempotency id cleanup
proxiesUsed->getInterface(alternativeChosen)
.expireIdempotencyId.send(ExpireIdempotencyIdRequest{
ci.version, uint8_t(ci.txnBatchId >> 8), trState->getTenantInfo() });
}
return Void();
} else {
// clear the RYW transaction which contains previous conflicting keys
@ -6948,11 +6983,16 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
throw e;
}
tr.idempotencyId = IdempotencyIdRef(tr.arena, IdempotencyIdRef(value.get()));
trState->automaticIdempotency = false;
break;
case FDBTransactionOptions::AUTOMATIC_IDEMPOTENCY:
validateOptionValueNotPresent(value);
tr.idempotencyId = IdempotencyIdRef(
tr.arena, IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
if (!tr.idempotencyId.valid()) {
tr.idempotencyId = IdempotencyIdRef(
tr.arena,
IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
}
trState->automaticIdempotency = true;
break;
default:
@ -7519,12 +7559,11 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
Optional<Reference<TransactionState>> trState);
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
TenantInfo tenantInfo,
KeyRange keys,
Reference<LocationInfo> locationInfo,
TenantMapEntry tenantEntry,
Optional<Reference<TransactionState>> trState) {
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
try {
WaitMetricsRequest req(tenantInfo, keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
@ -7533,12 +7572,16 @@ ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
cx->invalidateCache(tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
} else if (e.code() == error_code_unknown_tenant && trState.present() &&
tenantInfo.tenantId != TenantInfo::INVALID_TENANT) {
wait(trState.get()->handleUnknownTenant());
} else {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(tenantEntry.prefix, keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys, trState));
return m;
@ -7569,7 +7612,7 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
partBegin = (i == 0) ? keys.begin : locations[i].range.begin;
partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end;
fx[i] = doGetStorageMetrics(
cx, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
cx, tenantInfo, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
}
wait(waitForAll(fx));
for (int i = 0; i < nLocs; i++) {
@ -7724,27 +7767,18 @@ ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError) {
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return x;
} catch (Error& e) {
TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
return Optional<StorageMetrics>();
StorageMetrics x = wait(fx);
return x;
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
@ -7757,9 +7791,9 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample));
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
loop {
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
state std::vector<KeyRangeLocationInfo> locations =
wait(getKeyRangeLocations(cx,
tenantInfo,
@ -7789,13 +7823,25 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
continue;
}
Optional<StorageMetrics> res =
wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
try {
Optional<StorageMetrics> res =
wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
}
} catch (Error& e) {
TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
} else if (e.code() == error_code_unknown_tenant && trState.present() &&
tenantInfo.tenantId != TenantInfo::INVALID_TENANT) {
wait(trState.get()->handleUnknownTenant());
} else {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -7965,6 +8011,21 @@ ACTOR Future<TenantMapEntry> blobGranuleGetTenantEntry(Transaction* self,
return tme;
}
// Tenants are supposed to be unique and therefore can be loaded once.
// There is an assumption that a tenant exists as long as operations are happening against said tenant.
ACTOR Future<TenantMapEntry> blobLoadTenantMapEntry(Database* db, Key rangeStartKey, Optional<TenantName> tenantName) {
state Transaction tr(*db);
loop {
try {
TenantMapEntry tme = wait(blobGranuleGetTenantEntry(&tr, rangeStartKey, tenantName));
return tme;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
Future<Standalone<VectorRef<KeyRef>>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) {
return ::getRangeSplitPoints(
trState, keys, chunkSize, readVersion.isValid() && readVersion.isReady() ? readVersion.get() : latestVersion);
@ -8436,7 +8497,6 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
state Version readVersionOut = invalidVersion;
state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
state int loadSize = (BUGGIFY ? deterministicRandom()->randomInt(1, 20) : 20) * batchSize;
state bool loadedTenantEntry = false;
if (version.present()) {
if (version.get() == latestVersion) {
@ -8456,16 +8516,16 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
}
}
if (tenantName.present()) {
TenantMapEntry tme = wait(blobLoadTenantMapEntry(&db, range.begin, tenantName));
range = range.withPrefix(tme.prefix);
curRegion = KeyRangeRef(range.begin, range.begin);
}
loop {
if (curRegion.begin >= range.end) {
return readVersionOut;
}
if (tenantName.present() && !loadedTenantEntry) {
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenantName));
loadedTenantEntry = true;
range = range.withPrefix(tenantEntry.prefix);
curRegion = KeyRangeRef(range.begin, range.begin);
}
loop {
try {
wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize)));
@ -9328,7 +9388,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
mismatchEvent.detail("EndKey", request.range.end);
mismatchEvent.detail("CanReadPopped", request.canReadPopped);
mismatchEvent.detail("PopVersion", popVersion);
mismatchEvent.detail("DebugUID", request.debugUID);
mismatchEvent.detail("DebugUID", request.id);
// mismatch info
mismatchEvent.detail("MatchesFound", matchesFound);
@ -9354,7 +9414,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
"TSSMismatchChangeFeedStream");
summaryEvent.detail("TSSID", tssData.tssId)
.detail("MismatchId", mismatchUID)
.detail("FeedDebugUID", request.debugUID);
.detail("FeedDebugUID", request.id);
}
}
}
@ -9879,7 +9939,8 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state std::vector<Future<Void>> fetchers(interfs.size());
state std::vector<Future<Void>> onErrors(interfs.size());
state std::vector<MutationAndVersionStream> streams(interfs.size());
@ -9907,10 +9968,11 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
if (replyBufferSize != -1 && req.replyBufferSize < CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES) {
req.replyBufferSize = CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES;
}
req.debugUID = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.debugUID);
mergeCursorUID =
UID(mergeCursorUID.first() ^ req.debugUID.first(), mergeCursorUID.second() ^ req.debugUID.second());
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.id);
mergeCursorUID = UID(mergeCursorUID.first() ^ req.id.first(), mergeCursorUID.second() ^ req.id.second());
results->streams.push_back(interfs[i].first.changeFeedStream.getReplyStream(req));
maybeDuplicateTSSChangeFeedStream(req,
@ -10113,7 +10175,8 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state ChangeFeedStreamRequest req;
state Optional<ChangeFeedTSSValidationData> tssData;
@ -10123,10 +10186,11 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
req.range = range;
req.canReadPopped = canReadPopped;
req.replyBufferSize = replyBufferSize;
req.debugUID = deterministicRandom()->randomUniqueID();
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
if (DEBUG_CF_CLIENT_TRACE) {
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.debugUID)
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.id)
.detail("FeedID", rangeID)
.detail("Range", range)
.detail("Begin", *begin)
@ -10166,7 +10230,8 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state Span span("NAPI:GetChangeFeedStream"_loc);
db->usedAnyChangeFeeds = true;
@ -10256,14 +10321,22 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
}
CODE_PROBE(true, "Change feed merge cursor");
// TODO (jslocum): validate connectionFileChanged behavior
wait(
mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
cx->connectionFileChanged());
wait(mergeChangeFeedStream(
db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped, readOptions) ||
cx->connectionFileChanged());
} else {
CODE_PROBE(true, "Change feed single cursor");
StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
wait(singleChangeFeedStream(
db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
wait(singleChangeFeedStream(db,
interf,
range,
results,
rangeID,
&begin,
end,
replyBufferSize,
canReadPopped,
readOptions) ||
cx->connectionFileChanged());
}
} catch (Error& e) {
@ -10330,9 +10403,17 @@ Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> resu
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
return getChangeFeedStreamActor(
Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
bool canReadPopped,
ReadOptions readOptions) {
return getChangeFeedStreamActor(Reference<DatabaseContext>::addRef(this),
results,
rangeID,
begin,
end,
range,
replyBufferSize,
canReadPopped,
readOptions);
}
Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
@ -10568,70 +10649,28 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr,
state Standalone<VectorRef<KeyRangeRef>> blobRanges;
state Key beginKey = range.begin;
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state RangeResult results = wait(
krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
state RangeResult results =
wait(krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
blobRanges.arena().dependsOn(results.arena());
for (int i = 0; i < results.size() - 1; i++) {
if (results[i].value == blobRangeActive) {
blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
}
if (blobRanges.size() == batchLimit) {
return blobRanges;
}
blobRanges.arena().dependsOn(results.arena());
for (int i = 0; i < results.size() - 1; i++) {
if (results[i].value == blobRangeActive) {
blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
}
if (!results.more) {
if (blobRanges.size() == batchLimit) {
return blobRanges;
}
beginKey = results.back().key;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobbifiedRanges(Transaction* tr,
KeyRange range,
int rangeLimit,
Optional<TenantName> tenantName) {
state TenantMapEntry tme;
loop {
try {
if (tenantName.present()) {
wait(store(tme, blobGranuleGetTenantEntry(tr, range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
break;
} catch (Error& e) {
wait(tr->onError(e));
if (!results.more) {
return blobRanges;
}
beginKey = results.back().key;
}
state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
if (!tenantName.present()) {
return blobRanges;
}
// Strip tenant prefix out.
state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
for (auto& blobRange : blobRanges) {
// Filter out blob ranges that span tenants for some reason.
if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
TraceEvent("ListBlobbifiedRangeSpansTenants")
.suppressFor(/*seconds=*/5)
.detail("Tenant", tenantName.get())
.detail("Range", blobRange);
continue;
}
tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
}
return tenantBlobRanges;
}
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
@ -10643,7 +10682,6 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
state Transaction tr(cx);
state Key purgeKey;
state KeyRange purgeRange = range;
state bool loadedTenantPrefix = false;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
if (purgeVersion == latestVersion) {
@ -10663,23 +10701,22 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
throw unsupported_operation();
}
if (tenant.present()) {
TenantMapEntry tme = wait(blobLoadTenantMapEntry(&cx, range.begin, tenant));
purgeRange = purgeRange.withPrefix(tme.prefix);
}
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
if (tenant.present() && !loadedTenantPrefix) {
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenant));
loadedTenantPrefix = true;
purgeRange = purgeRange.withPrefix(tenantEntry.prefix);
}
// must be aligned to blob range(s)
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2, {});
getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2);
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
getBlobbifiedRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2, {});
getBlobRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2);
wait(success(blobbifiedBegin) && success(blobbifiedEnd));
if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
(!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
@ -10765,7 +10802,11 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state bool loadedTenantEntry = false;
if (tenantName.present()) {
TenantMapEntry tme = wait(blobLoadTenantMapEntry(&db, range.begin, tenantName));
range = range.withPrefix(tme.prefix);
}
state Value value = active ? blobRangeActive : blobRangeInactive;
loop {
@ -10773,13 +10814,6 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
if (tenantName.present() && !loadedTenantEntry) {
TenantMapEntry tenantEntry =
wait(blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName));
loadedTenantEntry = true;
range = range.withPrefix(tenantEntry.prefix);
}
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));
if (active) {
@ -10831,10 +10865,41 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer
state Database db(cx);
state Transaction tr(db);
state TenantMapEntry tme;
state Standalone<VectorRef<KeyRangeRef>> blobRanges;
Standalone<VectorRef<KeyRangeRef>> blobbifiedRanges = wait(getBlobbifiedRanges(&tr, range, rangeLimit, tenantName));
if (tenantName.present()) {
wait(store(tme, blobLoadTenantMapEntry(&db, range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
return blobbifiedRanges;
loop {
try {
wait(store(blobRanges, getBlobRanges(&tr, range, rangeLimit)));
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
if (!tenantName.present()) {
return blobRanges;
}
// Strip tenant prefix out.
state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
for (auto& blobRange : blobRanges) {
// Filter out blob ranges that span tenants for some reason.
if (!blobRange.begin.startsWith(tme.prefix) || !blobRange.end.startsWith(tme.prefix)) {
TraceEvent("ListBlobbifiedRangeSpansTenants")
.suppressFor(/*seconds=*/5)
.detail("Tenant", tenantName.get())
.detail("Range", blobRange);
continue;
}
tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tme.prefix));
}
return tenantBlobRanges;
}
Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,

View File

@ -42,7 +42,7 @@ ACTOR static Future<Void> produce(ParallelStream<ParallelStreamTest::TestValue>:
}
ACTOR static Future<Void> consume(FutureStream<ParallelStreamTest::TestValue> stream, int expected) {
state int next;
state int next = 0;
try {
loop {
ParallelStreamTest::TestValue value = waitNext(stream);

View File

@ -564,6 +564,10 @@ Version PaxosConfigTransaction::getCommittedVersion() const {
return impl->getCommittedVersion();
}
int64_t PaxosConfigTransaction::getTotalCost() const {
return 0;
}
int64_t PaxosConfigTransaction::getApproximateSize() const {
return impl->getApproximateSize();
}

View File

@ -88,6 +88,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
concurrent_lists = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_LISTS;
concurrent_reads_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_READS_PER_FILE;
concurrent_writes_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_WRITES_PER_FILE;
enable_read_cache = CLIENT_KNOBS->BLOBSTORE_ENABLE_READ_CACHE;
read_block_size = CLIENT_KNOBS->BLOBSTORE_READ_BLOCK_SIZE;
read_ahead_blocks = CLIENT_KNOBS->BLOBSTORE_READ_AHEAD_BLOCKS;
read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
@ -125,6 +126,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
TRY_PARAM(concurrent_lists, cl);
TRY_PARAM(concurrent_reads_per_file, crpf);
TRY_PARAM(concurrent_writes_per_file, cwpf);
TRY_PARAM(enable_read_cache, erc);
TRY_PARAM(read_block_size, rbs);
TRY_PARAM(read_ahead_blocks, rab);
TRY_PARAM(read_cache_blocks_per_file, rcb);
@ -162,6 +164,7 @@ std::string S3BlobStoreEndpoint::BlobKnobs::getURLParameters() const {
_CHECK_PARAM(concurrent_lists, cl);
_CHECK_PARAM(concurrent_reads_per_file, crpf);
_CHECK_PARAM(concurrent_writes_per_file, cwpf);
_CHECK_PARAM(enable_read_cache, erc);
_CHECK_PARAM(read_block_size, rbs);
_CHECK_PARAM(read_ahead_blocks, rab);
_CHECK_PARAM(read_cache_blocks_per_file, rcb);
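For reference, the new enable_read_cache knob joins the existing per-request URL parameters; a hypothetical backup URL (host, credentials, and resource name are placeholders, not part of this change) might pass it alongside the other read knobs:
blobstore://<access_key>:<secret>@<host>/<name>?erc=1&rbs=1048576&rab=2&rcb=4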

View File

@ -295,7 +295,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000;
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 );
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( DD_TENANT_AWARENESS_ENABLED, false ); if(isSimulated) DD_TENANT_AWARENESS_ENABLED = deterministicRandom()->coinflip();
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
@ -384,6 +384,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_WRITER_THREAD_PRIORITY, 0 );
init( ROCKSDB_BACKGROUND_PARALLELISM, 4 );
init( ROCKSDB_READ_PARALLELISM, 4 );
// If true, do not process and store RocksDB logs
init( ROCKSDB_MUTE_LOGS, false );
// Use a smaller memtable in simulation to avoid OOMs.
int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024;
init( ROCKSDB_MEMTABLE_BYTES, memtableBytes );
@ -406,6 +408,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_HISTOGRAMS_SAMPLE_RATE, 0.001 ); if( randomize && BUGGIFY ) ROCKSDB_HISTOGRAMS_SAMPLE_RATE = 0;
init( ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME, 30.0 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME = 0.1;
init( ROCKSDB_READ_RANGE_REUSE_ITERATORS, true ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_ITERATORS = deterministicRandom()->coinflip() ? true : false;
init( ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS, false ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS = deterministicRandom()->coinflip() ? true : false;
init( ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT, 200 );
// Set to 0 to disable rocksdb write rate limiting. Rate limiter unit: bytes per second.
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO.
@ -423,10 +427,11 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experimental purposes; never enable this in production.
// If enabled, all the committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ENABLE_CLEAR_RANGE_EAGER_READS knob.
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have contrary functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
init( ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
// canCommit will delay ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds, up to
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if rocksdb is overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0 to disable
@ -732,9 +737,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING );
init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 );
init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 );
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
init( GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED, 10 );
init( GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER, 240.0 );
init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
init( GLOBAL_TAG_THROTTLING_PROXY_LOGGING_INTERVAL, 60.0 );
//Storage Metrics
init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 );
@ -763,7 +769,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_KEYS_LOWER_PRIORITY, 0 );
init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
init( SERVE_AUDIT_STORAGE_PARALLELISM, 2 );
init( CHANGE_FEED_DISK_READS_PARALLELISM, 1000 ); if( randomize && BUGGIFY ) CHANGE_FEED_DISK_READS_PARALLELISM = 20;
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
@ -802,6 +807,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" );
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -913,7 +922,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_DEFAULT_EXTENT_SIZE, 32 * 1024 * 1024 );
init( REDWOOD_DEFAULT_EXTENT_READ_SIZE, 1024 * 1024 );
init( REDWOOD_EXTENT_CONCURRENT_READS, 4 );
init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
init( REDWOOD_KVSTORE_RANGE_PREFETCH, true );
init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 );
init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 );
@ -926,6 +934,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
@ -940,9 +949,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENCRYPTION_MODE, "AES-256-CTR" );
init( SIM_KMS_MAX_KEYS, 4096 );
init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 );
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ) ENABLE_TLOG_ENCRYPTION = true;
init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_STORAGE_SERVER_ENCRYPTION = !ENABLE_STORAGE_SERVER_ENCRYPTION;
init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_BLOB_GRANULE_ENCRYPTION = !ENABLE_BLOB_GRANULE_ENCRYPTION;
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION ) ENABLE_TLOG_ENCRYPTION = false;
init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION) ENABLE_STORAGE_SERVER_ENCRYPTION = false;
init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION) ENABLE_BLOB_GRANULE_ENCRYPTION = false;
// encrypt key proxy
init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); }
@ -996,6 +1005,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANIFEST_BACKUP, false );
init( BLOB_MANIFEST_BACKUP_INTERVAL, isSimulated ? 5.0 : 30.0 );
init( BLOB_FULL_RESTORE_MODE, false );
init( BLOB_MIGRATOR_CHECK_INTERVAL, isSimulated ? 1.0 : 5.0);
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );
@ -1003,8 +1013,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Blob Metadata
init( BLOB_METADATA_CACHE_TTL, isSimulated ? 120 : 24 * 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_CACHE_TTL = deterministicRandom()->randomInt(50, 100); }
init( BLOB_METADATA_REFRESH_INTERVAL, isSimulated ? 60 : 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
// HTTP KMS Connector
init( REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE, "file");
@ -1025,6 +1033,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// NOTE: 'token-name' can NOT contain the '#' character
init( REST_KMS_CONNECTOR_VALIDATION_TOKEN_DETAILS, "");
// Drop in-memory state associated with an idempotency id after this many seconds. Once dropped, this id cannot be
// expired proactively, but will eventually get cleaned up by the idempotency id cleaner.
init( IDEMPOTENCY_ID_IN_MEMORY_LIFETIME, 10);
// clang-format on
if (clientKnobs) {

View File

@ -296,6 +296,10 @@ Version SimpleConfigTransaction::getCommittedVersion() const {
return impl->getCommittedVersion();
}
int64_t SimpleConfigTransaction::getTotalCost() const {
return 0;
}
int64_t SimpleConfigTransaction::getApproximateSize() const {
return impl->getApproximateSize();
}

View File

@ -284,8 +284,6 @@ const KeyRangeRef readConflictRangeKeysRange =
const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef("\xff\xff/transaction/write_conflict_range/"_sr,
"\xff\xff/transaction/write_conflict_range/\xff\xff"_sr);
const KeyRef clusterIdKey = "\xff/clusterId"_sr;
const KeyRangeRef auditRange = KeyRangeRef("\xff/audit/"_sr, "\xff/audit0"_sr);
const KeyRef auditPrefix = auditRange.begin;
@ -1074,6 +1072,11 @@ const KeyRangeRef timeKeeperPrefixRange("\xff\x02/timeKeeper/map/"_sr, "\xff\x02
const KeyRef timeKeeperVersionKey = "\xff\x02/timeKeeper/version"_sr;
const KeyRef timeKeeperDisableKey = "\xff\x02/timeKeeper/disable"_sr;
// Durable cluster ID key. Added "Key" to the end to differentiate from the key
// "\xff/clusterId" which was stored in the txnStateStore in FDB 7.1, whereas
// this key is stored in the database in 7.2+.
const KeyRef clusterIdKey = "\xff/clusterIdKey"_sr;
// Backup Log Mutation constant variables
const KeyRef backupEnabledKey = "\xff/backupEnabled"_sr;
const KeyRangeRef backupLogKeys("\xff\x02/blog/"_sr, "\xff\x02/blog0"_sr);
@ -1810,4 +1813,4 @@ TEST_CASE("noSim/SystemData/compat/KeyServers") {
printf("ssi serdes test complete\n");
return Void();
}
}

View File

@ -626,6 +626,14 @@ ThreadFuture<SpanContext> ThreadSafeTransaction::getSpanContext() {
});
}
ThreadFuture<int64_t> ThreadSafeTransaction::getTotalCost() {
ISingleThreadTransaction* tr = this->tr;
return onMainThread([tr]() -> Future<int64_t> {
tr->checkDeferredError();
return tr->getTotalCost();
});
}
ThreadFuture<int64_t> ThreadSafeTransaction::getApproximateSize() {
ISingleThreadTransaction* tr = this->tr;
return onMainThread([tr]() -> Future<int64_t> {
@ -735,10 +743,10 @@ void ThreadSafeApi::runNetwork() {
Optional<Error> runErr;
try {
::runNetwork();
} catch (Error& e) {
} catch (const Error& e) {
TraceEvent(SevError, "RunNetworkError").error(e);
runErr = e;
} catch (std::exception& e) {
} catch (const std::exception& e) {
runErr = unknown_error();
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
@ -749,9 +757,9 @@ void ThreadSafeApi::runNetwork() {
for (auto& hook : threadCompletionHooks) {
try {
hook.first(hook.second);
} catch (Error& e) {
} catch (const Error& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
} catch (std::exception& e) {
} catch (const std::exception& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
@ -759,12 +767,10 @@ void ThreadSafeApi::runNetwork() {
}
if (runErr.present()) {
closeTraceFile();
throw runErr.get();
}
TraceEvent("RunNetworkTerminating");
closeTraceFile();
}
void ThreadSafeApi::stopNetwork() {

View File

@ -103,6 +103,7 @@ public:
Counter latestCipherKeyCacheNeedsRefresh;
LatencySample getCipherKeysLatency;
LatencySample getLatestCipherKeysLatency;
LatencySample getBlobMetadataLatency;
std::array<CounterSet, int(UsageType::MAX)> counterSets;
};

View File

@ -91,4 +91,8 @@ struct BlobMetadataDetailsRef {
}
};
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName);
#endif

View File

@ -22,7 +22,7 @@
#define FDBCLIENT_BUILD_IDEMPOTENCY_ID_MUTATIONS_H
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#pragma once

View File

@ -235,6 +235,7 @@ public:
int BLOBSTORE_CONCURRENT_LISTS;
int BLOBSTORE_CONCURRENT_WRITES_PER_FILE;
int BLOBSTORE_CONCURRENT_READS_PER_FILE;
int BLOBSTORE_ENABLE_READ_CACHE;
int BLOBSTORE_READ_BLOCK_SIZE;
int BLOBSTORE_READ_AHEAD_BLOCKS;
int BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
@ -262,8 +263,8 @@ public:
double TAG_THROTTLE_EXPIRATION_INTERVAL;
int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations
int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations
double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before
// being rejected
// Cost multiplier for writes (because write operations are more expensive than reads):
double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
// busyness reporting
double BUSYNESS_SPIKE_START_THRESHOLD;
@ -272,6 +273,7 @@ public:
// Blob Granules
int BG_MAX_GRANULE_PARALLELISM;
int BG_TOO_MANY_GRANULES;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// The coordinator key/value in the storage server might be inconsistent with the value stored in the cluster file.
// This might happen when a recovery is happening together with a cluster controller coordinator key change.

View File

@ -30,7 +30,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbclient/GrvProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/VersionVector.h"
@ -61,6 +61,7 @@ struct CommitProxyInterface {
RequestStream<struct ProxySnapRequest> proxySnapReq;
RequestStream<struct ExclusionSafetyCheckRequest> exclusionSafetyCheckReq;
RequestStream<struct GetDDMetricsRequest> getDDMetrics;
PublicRequestStream<struct ExpireIdempotencyIdRequest> expireIdempotencyId;
UID id() const { return commit.getEndpoint().token; }
std::string toString() const { return id().shortString(); }
@ -87,6 +88,8 @@ struct CommitProxyInterface {
exclusionSafetyCheckReq =
RequestStream<struct ExclusionSafetyCheckRequest>(commit.getEndpoint().getAdjustedEndpoint(8));
getDDMetrics = RequestStream<struct GetDDMetricsRequest>(commit.getEndpoint().getAdjustedEndpoint(9));
expireIdempotencyId =
PublicRequestStream<struct ExpireIdempotencyIdRequest>(commit.getEndpoint().getAdjustedEndpoint(10));
}
}
@ -103,6 +106,7 @@ struct CommitProxyInterface {
streams.push_back(proxySnapReq.getReceiver());
streams.push_back(exclusionSafetyCheckReq.getReceiver());
streams.push_back(getDDMetrics.getReceiver());
streams.push_back(expireIdempotencyId.getReceiver());
FlowTransport::transport().addEndpoints(streams);
}
};
@ -151,6 +155,24 @@ struct ClientDBInfo {
}
};
struct ExpireIdempotencyIdRequest {
constexpr static FileIdentifier file_identifier = 1900933;
Version commitVersion = invalidVersion;
uint8_t batchIndexHighByte = 0;
TenantInfo tenant;
ExpireIdempotencyIdRequest() {}
ExpireIdempotencyIdRequest(Version commitVersion, uint8_t batchIndexHighByte, TenantInfo tenant)
: commitVersion(commitVersion), batchIndexHighByte(batchIndexHighByte), tenant(tenant) {}
bool verify() const { return tenant.isAuthorized(); }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, commitVersion, batchIndexHighByte, tenant);
}
};
struct CommitID {
constexpr static FileIdentifier file_identifier = 14254927;
Version version; // returns invalidVersion if transaction conflicts

View File

@ -382,7 +382,8 @@ public:
Version end = std::numeric_limits<Version>::max(),
KeyRange range = allKeys,
int replyBufferSize = -1,
bool canReadPopped = true);
bool canReadPopped = true,
ReadOptions readOptions = { ReadType::NORMAL, CacheResult::False });
Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
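A hypothetical call site (illustration only; results, feedId, feedRange, beginVersion, and endVersion are placeholders) showing how the new readOptions argument is threaded through:
// Stream a change feed with fetch-priority reads that bypass the storage read cache.
ReadOptions options(ReadType::FETCH, CacheResult::False);
Future<Void> feed = cx->getChangeFeedStream(results,
                                            feedId,
                                            beginVersion,
                                            endVersion,
                                            feedRange,
                                            /*replyBufferSize*/ -1,
                                            /*canReadPopped*/ true,
                                            options);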

View File

@ -546,29 +546,21 @@ struct hash<KeyRange> {
enum { invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits<int64_t>::max() };
inline Key keyAfter(const KeyRef& key) {
if (key == "\xff\xff"_sr)
return key;
Standalone<StringRef> r;
uint8_t* s = new (r.arena()) uint8_t[key.size() + 1];
if (key.size() > 0) {
memcpy(s, key.begin(), key.size());
}
s[key.size()] = 0;
((StringRef&)r) = StringRef(s, key.size() + 1);
return r;
}
inline KeyRef keyAfter(const KeyRef& key, Arena& arena) {
if (key == "\xff\xff"_sr)
return key;
// Don't include fdbclient/SystemData.h for the allKeys symbol to avoid a cyclic include
static const auto allKeysEnd = "\xff\xff"_sr;
if (key == allKeysEnd) {
return allKeysEnd;
}
uint8_t* t = new (arena) uint8_t[key.size() + 1];
memcpy(t, key.begin(), key.size());
t[key.size()] = 0;
return KeyRef(t, key.size() + 1);
}
inline KeyRange singleKeyRange(const KeyRef& a) {
return KeyRangeRef(a, keyAfter(a));
inline Key keyAfter(const KeyRef& key) {
Key result;
result.contents() = keyAfter(key, result.arena());
return result;
}
inline KeyRangeRef singleKeyRange(KeyRef const& key, Arena& arena) {
uint8_t* t = new (arena) uint8_t[key.size() + 1];
@ -576,6 +568,11 @@ inline KeyRangeRef singleKeyRange(KeyRef const& key, Arena& arena) {
t[key.size()] = 0;
return KeyRangeRef(KeyRef(t, key.size()), KeyRef(t, key.size() + 1));
}
inline KeyRange singleKeyRange(const KeyRef& a) {
KeyRange result;
result.contents() = singleKeyRange(a, result.arena());
return result;
}
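// Example (sketch): keyAfter appends a single zero byte, giving the smallest key strictly greater than its
// argument, and singleKeyRange uses it to build the smallest range containing exactly one key:
//
//   Key k = "foo"_sr;
//   ASSERT(keyAfter(k) == "foo\x00"_sr);
//   ASSERT(singleKeyRange(k) == KeyRangeRef(k, keyAfter(k)));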
inline KeyRange prefixRange(KeyRef prefix) {
Standalone<KeyRangeRef> range;
KeyRef start = KeyRef(range.arena(), prefix);
@ -1494,7 +1491,7 @@ struct EncryptionAtRestMode {
bool operator==(const EncryptionAtRestMode& e) const { return isEquals(e); }
bool operator!=(const EncryptionAtRestMode& e) const { return !isEquals(e); }
static EncryptionAtRestMode fromValue(Optional<ValueRef> val) {
static EncryptionAtRestMode fromValueRef(Optional<ValueRef> val) {
if (!val.present()) {
return DISABLED;
}
@ -1508,6 +1505,14 @@ struct EncryptionAtRestMode {
return static_cast<Mode>(num);
}
static EncryptionAtRestMode fromValue(Optional<Value> val) {
if (!val.present()) {
return EncryptionAtRestMode();
}
return EncryptionAtRestMode::fromValueRef(Optional<ValueRef>(val.get().contents()));
}
uint32_t mode;
};
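// Usage sketch (the key name below is hypothetical): fromValueRef takes a non-owning Optional<ValueRef>,
// while the fromValue overload accepts an owning Optional<Value> and forwards its contents:
//
//   Optional<Value> stored = wait(tr.get(encryptionAtRestModeKey)); // hypothetical system key
//   EncryptionAtRestMode mode = EncryptionAtRestMode::fromValue(stored);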
@ -1635,13 +1640,7 @@ struct StorageWiggleValue {
}
};
enum class ReadType {
EAGER,
FETCH,
LOW,
NORMAL,
HIGH,
};
enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH };
FDB_DECLARE_BOOLEAN_PARAM(CacheResult);
@ -1657,14 +1656,14 @@ struct ReadOptions {
Optional<UID> debugID;
Optional<Version> consistencyCheckStartVersion;
ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){};
ReadOptions(Optional<UID> debugID,
ReadOptions(Optional<UID> debugID = Optional<UID>(),
ReadType type = ReadType::NORMAL,
CacheResult cache = CacheResult::False,
CacheResult cache = CacheResult::True,
Optional<Version> version = Optional<Version>())
: type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){};
ReadOptions(ReadType type, CacheResult cache = CacheResult::True) : ReadOptions({}, type, cache) {}
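// Usage sketch (illustrative): the delegating constructor above lets callers pick a read type while keeping
// or overriding the CacheResult default:
//
//   ReadOptions defaults;                                   // ReadType::NORMAL, CacheResult::True
//   ReadOptions fetch(ReadType::FETCH, CacheResult::False); // skip result caching for fetch-style reads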
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion);

View File

@ -284,7 +284,6 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
state bool oldReplicationUsesDcId = false;
state bool warnPPWGradual = false;
state bool warnChangeStorageNoMigrate = false;
state bool warnRocksDBIsExperimental = false;
state bool warnShardedRocksDBIsExperimental = false;
loop {

View File

@ -120,6 +120,7 @@ public:
// later if they are not really needed.
virtual ThreadFuture<VersionVector> getVersionVector() = 0;
virtual ThreadFuture<SpanContext> getSpanContext() = 0;
virtual ThreadFuture<int64_t> getTotalCost() = 0;
virtual ThreadFuture<int64_t> getApproximateSize() = 0;
virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;

View File

@ -101,6 +101,7 @@ public:
virtual Version getCommittedVersion() const = 0;
virtual VersionVector getVersionVector() const = 0;
virtual SpanContext getSpanContext() const = 0;
virtual int64_t getTotalCost() const = 0;
virtual int64_t getApproximateSize() const = 0;
virtual Future<Standalone<StringRef>> getVersionstamp() = 0;
virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.h
* IdempotencyId.actor.h
*
* This source file is part of the FoundationDB open source project
*
@ -18,8 +18,13 @@
* limitations under the License.
*/
#ifndef FDBCLIENT_IDEMPOTENCYID_H
#define FDBCLIENT_IDEMPOTENCYID_H
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H
#include "fdbclient/IdempotencyId.actor.g.h"
#elif !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H
#pragma once
@ -28,12 +33,24 @@
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/serialize.h"
#include "flow/actorcompiler.h" // this has to be the last include
struct CommitResult {
Version commitVersion;
uint16_t batchIndex;
};
// The type of the value stored at the key |idempotencyIdsExpiredVersion|
struct IdempotencyIdsExpiredVersion {
static constexpr auto file_identifier = 3746945;
Version expired = 0;
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, expired);
}
};
// See design/idempotency_ids.md for more information. Designed so that the common case of a random 16 byte id does not
// usually require indirection. Either invalid or an id with length >= 16 and < 256.
struct IdempotencyIdRef {
@ -163,4 +180,10 @@ private:
// Check if id is present in kv, and if so return the commit version and batchIndex
Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const IdempotencyIdRef& id);
#endif
// Make a range containing only the idempotency key associated with version and highOrderBatchIndex
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex);
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex);
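// Usage sketch (illustrative): callers can round-trip between a (commit version, high-order batch index byte)
// pair and the single-key range holding the corresponding idempotency ids:
//
//   Arena arena;
//   KeyRangeRef r = makeIdempotencySingleKeyRange(arena, commitVersion, highOrderBatchIndex);
//   Version v;
//   uint8_t b;
//   decodeIdempotencyKey(r.begin, v, b); // recovers commitVersion and highOrderBatchIndex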
#include "flow/unactorcompiler.h"
#endif

View File

@ -377,6 +377,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
FDBFuture* (*transactionCommit)(FDBTransaction* tr);
fdb_error_t (*transactionGetCommittedVersion)(FDBTransaction* tr, int64_t* outVersion);
FDBFuture* (*transactionGetTotalCost)(FDBTransaction* tr);
FDBFuture* (*transactionGetApproximateSize)(FDBTransaction* tr);
FDBFuture* (*transactionWatch)(FDBTransaction* tr, uint8_t const* keyName, int keyNameLength);
FDBFuture* (*transactionOnError)(FDBTransaction* tr, fdb_error_t error);
@ -505,6 +506,7 @@ public:
Version getCommittedVersion() override;
ThreadFuture<VersionVector> getVersionVector() override;
ThreadFuture<SpanContext> getSpanContext() override { return SpanContext(); };
ThreadFuture<int64_t> getTotalCost() override;
ThreadFuture<int64_t> getApproximateSize() override;
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
@ -732,6 +734,7 @@ public:
Version getCommittedVersion() override;
ThreadFuture<VersionVector> getVersionVector() override;
ThreadFuture<SpanContext> getSpanContext() override;
ThreadFuture<int64_t> getTotalCost() override;
ThreadFuture<int64_t> getApproximateSize() override;
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
@ -1024,6 +1027,7 @@ public:
ThreadFuture<Void> protocolVersionMonitor;
Future<Void> sharedStateUpdater;
bool isConfigDB;
// Versions older than 6.1 do not benefit from having their database connections closed. Additionally,
// there are various issues that result in negative behavior in some cases if the connections are closed.

View File

@ -249,6 +249,9 @@ struct TransactionState : ReferenceCounted<TransactionState> {
SpanContext spanContext;
UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False;
bool readVersionObtainedFromGrvProxy;
// Measured by summing the bytes accessed by each read and write operation
// after rounding up to the nearest page size and applying a write penalty
int64_t totalCost = 0;
// Special flag to skip prepending tenant prefix to mutations and conflict ranges
// when a dummy, internal transaction gets committed. The sole purpose of commitDummyTransaction() is to
@ -268,6 +271,8 @@ struct TransactionState : ReferenceCounted<TransactionState> {
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
bool automaticIdempotency = false;
// Only available so that Transaction can have a default constructor, for use in state variables
TransactionState(TaskPriority taskID, SpanContext spanContext)
: taskID(taskID), spanContext(spanContext), tenantSet(false) {}
@ -445,6 +450,8 @@ public:
// May be called only after commit() returns success
Version getCommittedVersion() const { return trState->committedVersion; }
int64_t getTotalCost() const { return trState->totalCost; }
// Will be fulfilled only after commit() returns success
[[nodiscard]] Future<Standalone<StringRef>> getVersionstamp();
@ -482,6 +489,7 @@ public:
Database getDatabase() const { return trState->cx; }
static Reference<TransactionLogInfo> createTrLogInfoProbabilistically(const Database& cx);
Transaction& getTransaction() { return *this; }
void setTransactionID(UID id);
void setToken(uint64_t token);
@ -563,9 +571,16 @@ ACTOR Future<std::vector<CheckpointMetaData>> getCheckpointMetaData(Database cx,
// Checks with Data Distributor that it is safe to mark all servers in exclusions as failed
ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion> exclusions);
// Round up to the nearest page size
// Measured in bytes, rounded up to the nearest page size. Multiply by fungibility ratio
// because writes are more expensive than reads.
inline uint64_t getWriteOperationCost(uint64_t bytes) {
return (bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1;
return CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR *
((bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1);
}
// Measured in bytes, rounded up to the nearest page size.
inline uint64_t getReadOperationCost(uint64_t bytes) {
return ((bytes - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
}
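// Worked example (knob values are illustrative, not taken from this change): with
// READ_COST_BYTE_FACTOR == WRITE_COST_BYTE_FACTOR == 16384 and GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO == 5.0,
//   getReadOperationCost(100)   == 16384  (one page)
//   getReadOperationCost(20000) == 32768  (two pages)
//   getWriteOperationCost(100)  == 5 * 16384 * 1 == 81920
// i.e. a small write is costed like five page-sized reads.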
// Create a transaction to set the value of system key \xff/conf/perpetual_storage_wiggle. If enable == true, the value

View File

@ -64,6 +64,7 @@ public:
void clear(KeyRef const&) override;
Future<Void> commit() override;
Version getCommittedVersion() const override;
int64_t getTotalCost() const override;
int64_t getApproximateSize() const override;
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
Future<Void> onError(Error const& e) override;

View File

@ -149,6 +149,7 @@ public:
VersionVector getVersionVector() const override { return tr.getVersionVector(); }
SpanContext getSpanContext() const override { return tr.getSpanContext(); }
int64_t getTotalCost() const override { return tr.getTotalCost(); }
int64_t getApproximateSize() const override { return approximateSize; }
[[nodiscard]] Future<Standalone<StringRef>> getVersionstamp() override;

View File

@ -58,8 +58,8 @@ public:
requests_per_second, list_requests_per_second, write_requests_per_second, read_requests_per_second,
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
read_block_size, read_ahead_blocks, read_cache_blocks_per_file, max_send_bytes_per_second,
max_recv_bytes_per_second, sdk_auth;
enable_read_cache, read_block_size, read_ahead_blocks, read_cache_blocks_per_file,
max_send_bytes_per_second, max_recv_bytes_per_second, sdk_auth;
bool set(StringRef name, int value);
std::string getURLParameters() const;
static std::vector<std::string> getKnobDescriptions() {
@ -86,6 +86,7 @@ public:
"concurrent_lists (or cl) Max concurrent list operations that can be in progress at once.",
"concurrent_reads_per_file (or crps) Max concurrent reads in progress for any one file.",
"concurrent_writes_per_file (or cwps) Max concurrent uploads in progress for any one file.",
"enable_read_cache (or erc) Whether read block caching is enabled.",
"read_block_size (or rbs) Block size in bytes to be used for reads.",
"read_ahead_blocks (or rab) Number of blocks to read ahead of requested offset.",
"read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.",

View File

@ -318,6 +318,7 @@ public:
int64_t ROCKSDB_MEMTABLE_BYTES;
bool ROCKSDB_LEVEL_STYLE_COMPACTION;
bool ROCKSDB_UNSAFE_AUTO_FSYNC;
bool ROCKSDB_MUTE_LOGS;
int64_t ROCKSDB_PERIODIC_COMPACTION_SECONDS;
int ROCKSDB_PREFIX_LEN;
int64_t ROCKSDB_BLOCK_CACHE_SIZE;
@ -335,6 +336,8 @@ public:
double ROCKSDB_HISTOGRAMS_SAMPLE_RATE;
double ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME;
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
bool ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS;
int ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT;
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY;
@ -351,6 +354,7 @@ public:
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
bool ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;
@ -630,14 +634,16 @@ public:
double GLOBAL_TAG_THROTTLING_MIN_RATE;
// Used by global tag throttling counters
double GLOBAL_TAG_THROTTLING_FOLDING_TIME;
// Cost multiplier for writes (because write operations are more expensive than reads)
double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
// Maximum number of tags tracked by global tag throttler. Additional tags will be ignored
// until some existing tags expire
int64_t GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED;
// Global tag throttler forgets about throughput from a tag once no new transactions from that
// tag have been received for this duration (in seconds):
int64_t GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER;
// Maximum duration that a transaction can be tag throttled by proxy before being rejected
double PROXY_MAX_TAG_THROTTLE_DURATION;
// Interval at which latency bands are logged for each tag on grv proxy
double GLOBAL_TAG_THROTTLING_PROXY_LOGGING_INTERVAL;
double MAX_TRANSACTIONS_PER_BYTE;
@ -719,7 +725,6 @@ public:
int FETCH_KEYS_LOWER_PRIORITY;
int SERVE_FETCH_CHECKPOINT_PARALLELISM;
int SERVE_AUDIT_STORAGE_PARALLELISM;
int CHANGE_FEED_DISK_READS_PARALLELISM;
int BUGGIFY_BLOCK_BYTES;
int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT;
double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD;
@ -742,7 +747,6 @@ public:
int64_t MIN_TAG_READ_PAGES_RATE;
int64_t MIN_TAG_WRITE_PAGES_RATE;
double TAG_MEASUREMENT_INTERVAL;
int64_t READ_COST_BYTE_FACTOR;
bool PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS;
bool REPORT_DD_METRICS;
double DD_METRICS_REPORT_INTERVAL;
@ -759,6 +763,9 @@ public:
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -888,7 +895,6 @@ public:
int REDWOOD_DEFAULT_EXTENT_SIZE; // Extent size for new Redwood files
int REDWOOD_DEFAULT_EXTENT_READ_SIZE; // Extent read size for Redwood files
int REDWOOD_EXTENT_CONCURRENT_READS; // Max number of simultaneous extent disk reads in progress.
int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
bool REDWOOD_KVSTORE_RANGE_PREFETCH; // Whether to use range read prefetching
double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page
int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at
@ -908,6 +914,8 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS;
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;
double LATENCY_METRICS_LOGGING_INTERVAL;
@ -978,10 +986,10 @@ public:
bool BLOB_MANIFEST_BACKUP;
double BLOB_MANIFEST_BACKUP_INTERVAL;
bool BLOB_FULL_RESTORE_MODE;
double BLOB_MIGRATOR_CHECK_INTERVAL;
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// HTTP KMS Connector
std::string REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE;
@ -995,6 +1003,9 @@ public:
std::string REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT;
std::string REST_KMS_CONNECTOR_GET_BLOB_METADATA_ENDPOINT;
// Idempotency ids
double IDEMPOTENCY_ID_IN_MEMORY_LIFETIME;
ServerKnobs(Randomize, ClientKnobs*, IsSimulated);
void initialize(Randomize, ClientKnobs*, IsSimulated);
};

View File

@ -76,6 +76,7 @@ public:
void reset() override;
void debugTransaction(UID dID) override;
void checkDeferredError() const override;
int64_t getTotalCost() const override;
int64_t getApproximateSize() const override;
void set(KeyRef const&, ValueRef const&) override;
void clear(KeyRangeRef const&) override { throw client_invalid_operation(); }

View File

@ -45,7 +45,7 @@ struct CheckpointMetaData {
constexpr static FileIdentifier file_identifier = 13804342;
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int16_t format; // CheckpointFormat.
UID ssID; // Storage server ID on which this checkpoint is created.
UID checkpointID; // A unique id for this checkpoint.
@ -58,11 +58,15 @@ struct CheckpointMetaData {
CheckpointMetaData() = default;
CheckpointMetaData(KeyRange const& range, CheckpointFormat format, UID const& ssID, UID const& checkpointID)
: version(invalidVersion), range(range), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(invalidVersion), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {
this->ranges.push_back(range);
}
CheckpointMetaData(Version version, KeyRange const& range, CheckpointFormat format, UID checkpointID)
: version(version), range(range), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(version), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending), referenceCount(0),
gcTime(0) {
this->ranges.push_back(range);
}
CheckpointState getState() const { return static_cast<CheckpointState>(state); }
@ -73,7 +77,7 @@ struct CheckpointMetaData {
void setFormat(CheckpointFormat format) { this->format = static_cast<int16_t>(format); }
std::string toString() const {
std::string res = "Checkpoint MetaData:\nRange: " + range.toString() + "\nVersion: " + std::to_string(version) +
std::string res = "Checkpoint MetaData:\nRange: " + describe(ranges) + "\nVersion: " + std::to_string(version) +
"\nFormat: " + std::to_string(format) + "\nServer: " + ssID.toString() +
"\nID: " + checkpointID.toString() + "\nState: " + std::to_string(static_cast<int>(state)) +
"\n";
@ -82,7 +86,7 @@ struct CheckpointMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, range, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
serializer(ar, version, ranges, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
}
};
@ -99,23 +103,28 @@ struct DataMoveMetaData {
constexpr static FileIdentifier file_identifier = 13804362;
UID id; // A unique id for this data move.
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int priority;
std::set<UID> src;
std::set<UID> dest;
std::set<UID> checkpoints;
int16_t phase; // DataMoveMetaData::Phase.
int8_t mode;
DataMoveMetaData() = default;
DataMoveMetaData(UID id, Version version, KeyRange range)
: id(id), version(version), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, Version version, KeyRange range) : id(id), version(version), priority(0), mode(0) {
this->ranges.push_back(range);
}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), priority(0), mode(0) {
this->ranges.push_back(range);
}
Phase getPhase() const { return static_cast<Phase>(phase); }
void setPhase(Phase phase) { this->phase = static_cast<int16_t>(phase); }
std::string toString() const {
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + range.toString() +
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + describe(ranges) +
" [Phase]: " + std::to_string(static_cast<int>(phase)) +
" [Source Servers]: " + describe(src) + " [Destination Servers]: " + describe(dest);
return res;
@ -123,7 +132,7 @@ struct DataMoveMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id, version, range, phase, src, dest);
serializer(ar, id, version, ranges, priority, src, dest, checkpoints, phase, mode);
}
};

View File

@ -890,16 +890,16 @@ struct ChangeFeedStreamRequest {
KeyRange range;
int replyBufferSize = -1;
bool canReadPopped = true;
UID debugUID; // This is only used for debugging and tracing, but being able to link a client + server side stream
// is so useful for testing, and this is such small overhead compared to streaming large amounts of
// change feed data, it is left in the interface
UID id; // This must be globally unique among ChangeFeedStreamRequest instances
Optional<ReadOptions> options;
ReplyPromiseStream<ChangeFeedStreamReply> reply;
ChangeFeedStreamRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, debugUID, arena);
serializer(
ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, id, options, arena);
}
};

View File

@ -92,8 +92,6 @@ void decodeKeyServersValue(RangeResult result,
UID& destID,
bool missingIsError = true);
extern const KeyRef clusterIdKey;
extern const KeyRangeRef auditRange;
extern const KeyRef auditPrefix;
const Key auditRangeKey(const AuditType type, const UID& auditId, const KeyRef& key);
@ -505,6 +503,9 @@ extern const KeyRangeRef timeKeeperPrefixRange;
extern const KeyRef timeKeeperVersionKey;
extern const KeyRef timeKeeperDisableKey;
// Durable cluster ID key
extern const KeyRef clusterIdKey;
// Layer status metadata prefix
extern const KeyRangeRef layerStatusMetaPrefixRange;

View File

@ -211,6 +211,31 @@ struct TenantMetadata {
};
typedef VersionedMap<TenantName, TenantMapEntry> TenantMap;
class TenantPrefixIndex : public VersionedMap<Key, TenantName>, public ReferenceCounted<TenantPrefixIndex> {};
// A set of tenant names that is generally expected to contain exactly one item. It can temporarily hold more than
// one item while it is being updated (e.g. while restoring a backup), but it is expected to settle back to a single
// item, and get() cannot be used while it contains more than one.
struct TenantNameUniqueSet {
std::unordered_set<TenantName> tenantNames;
// Returns the single tenant name stored in the set
// It is an error to call this function if the set holds more than one name
TenantName get() const {
ASSERT(tenantNames.size() == 1);
return *tenantNames.begin();
}
void insert(TenantName const& name) { tenantNames.insert(name); }
// Removes a tenant name from the set. Returns true if the set is now empty.
bool remove(TenantName const& name) {
auto itr = tenantNames.find(name);
ASSERT(itr != tenantNames.end());
tenantNames.erase(itr);
return tenantNames.empty();
}
};
class TenantPrefixIndex : public VersionedMap<Key, TenantNameUniqueSet>, public ReferenceCounted<TenantPrefixIndex> {};
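// Usage sketch (illustrative): the prefix index now maps a tenant prefix to a TenantNameUniqueSet instead of
// a single name, so writers insert/remove names and readers call get() once the set is back to one entry:
//
//   TenantNameUniqueSet names;
//   names.insert("tenant_old"_sr);
//   names.insert("tenant_new"_sr); // temporarily two names, e.g. mid-restore
//   bool nowEmpty = names.remove("tenant_old"_sr);
//   ASSERT(!nowEmpty && names.get() == "tenant_new"_sr);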
#endif

View File

@ -205,6 +205,7 @@ public:
Version getCommittedVersion() override;
ThreadFuture<VersionVector> getVersionVector() override;
ThreadFuture<SpanContext> getSpanContext() override;
ThreadFuture<int64_t> getTotalCost() override;
ThreadFuture<int64_t> getApproximateSize() override;
ThreadFuture<uint64_t> getProtocolVersion();

View File

@ -279,7 +279,7 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use. Unless the automatic_idempotency option is set after this option, the client will not automatically attempt to remove this id from the cluster after a successful commit."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."

View File

@ -5,9 +5,8 @@ get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
strip_debug_symbols(fdbmonitor)
assert_no_version_h(fdbmonitor)
if(UNIX AND NOT APPLE)
target_link_libraries(fdbmonitor PRIVATE rt)
target_link_libraries(fdbmonitor PRIVATE rt)
endif()
# FIXME: This include directory is an ugly hack. We probably want to fix this.
# as soon as we get rid of the old build system
@ -17,17 +16,17 @@ target_link_libraries(fdbmonitor PUBLIC Threads::Threads)
# appears to change its behavior (it no longer seems to restart killed
# processes). fdbmonitor is single-threaded anyway.
get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${fdbmonitor_options})
endif ()
endif()
get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${fdbmonitor_options})
endif ()
endif()
if(GENERATE_DEBUG_PACKAGES)
fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server)
@ -51,7 +50,7 @@ add_custom_target(clean_sandbox
add_custom_target(start_sandbox
COMMAND ${CMAKE_BINARY_DIR}/bin/fdbmonitor --conffile ${CMAKE_BINARY_DIR}/sandbox/foundationdb.conf
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
add_dependencies(start_sandbox fdbmonitor fdbserver)
@ -61,6 +60,6 @@ if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
endif()
add_custom_target(generate_profile
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)

View File

@ -133,3 +133,50 @@ Future<Void> CounterCollection::traceCounters(std::string const& traceEventName,
return CounterCollectionImpl::traceCounters(
this, traceEventName, traceEventID, interval, trackLatestName, decorator);
}
void LatencyBands::insertBand(double value) {
bands.emplace(std::make_pair(value, std::make_unique<Counter>(format("Band%f", value), *cc)));
}
FDB_DEFINE_BOOLEAN_PARAM(Filtered);
LatencyBands::LatencyBands(std::string const& name,
UID id,
double loggingInterval,
std::function<void(TraceEvent&)> const& decorator)
: name(name), id(id), loggingInterval(loggingInterval), decorator(decorator) {}
void LatencyBands::addThreshold(double value) {
if (value > 0 && bands.count(value) == 0) {
if (bands.size() == 0) {
ASSERT(!cc && !filteredCount);
cc = std::make_unique<CounterCollection>(name, id.toString());
logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name, decorator);
filteredCount = std::make_unique<Counter>("Filtered", *cc);
insertBand(std::numeric_limits<double>::infinity());
}
insertBand(value);
}
}
void LatencyBands::addMeasurement(double measurement, int count, Filtered filtered) {
if (filtered && filteredCount) {
(*filteredCount) += count;
} else if (bands.size() > 0) {
auto itr = bands.upper_bound(measurement);
ASSERT(itr != bands.end());
(*itr->second) += count;
}
}
void LatencyBands::clearBands() {
logger = Void();
bands.clear();
filteredCount.reset();
cc.reset();
}
LatencyBands::~LatencyBands() {
clearBands();
}

View File

@ -757,12 +757,18 @@ Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const BasicLoadBalanc
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const void*);
// A simpler version of LoadBalance that does not send second requests, for use where the list of servers is always fresh
//
// If |alternativeChosen| is not null, then atMostOnce must be True, and if the returned future completes successfully
// then *alternativeChosen will be the alternative to which the message was sent. *alternativeChosen must outlive the
// returned future.
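// Usage sketch (hypothetical names; `proxies`, `SomeInterface`, `SomeRequest`, and `SomeReply` stand in for a
// real ModelInterface and its request/reply types):
//
//   state int chosen = -1;
//   SomeReply reply = wait(basicLoadBalance(proxies, &SomeInterface::someStream, SomeRequest(),
//                                           TaskPriority::DefaultPromiseEndpoint, AtMostOnce::True, &chosen));
//   // `chosen` now indexes the alternative that received the request and must outlive the returned future.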
ACTOR template <class Interface, class Request, class Multi, bool P>
Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> alternatives,
RequestStream<Request, P> Interface::*channel,
Request request = Request(),
TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
AtMostOnce atMostOnce = AtMostOnce::False) {
AtMostOnce atMostOnce = AtMostOnce::False,
int* alternativeChosen = nullptr) {
ASSERT(alternativeChosen == nullptr || atMostOnce == AtMostOnce::True);
setReplyPriority(request, taskID);
if (!alternatives)
return Never();
@ -791,6 +797,9 @@ Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> al
useAlt = (nextAlt + alternatives->size() - 1) % alternatives->size();
stream = &alternatives->get(useAlt, channel);
if (alternativeChosen != nullptr) {
*alternativeChosen = useAlt;
}
if (!IFailureMonitor::failureMonitor().getState(stream->getEndpoint()).failed)
break;
nextAlt = (nextAlt + 1) % alternatives->size();

View File

@ -182,47 +182,12 @@ static void specialCounter(CounterCollection& collection, std::string const& nam
new SpecialCounter<F>(collection, name, std::move(f));
}
FDB_DECLARE_BOOLEAN_PARAM(Filtered);
class LatencyBands {
public:
LatencyBands(std::string name, UID id, double loggingInterval)
: name(name), id(id), loggingInterval(loggingInterval) {}
void addThreshold(double value) {
if (value > 0 && bands.count(value) == 0) {
if (bands.size() == 0) {
ASSERT(!cc && !filteredCount);
cc = std::make_unique<CounterCollection>(name, id.toString());
logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name);
filteredCount = std::make_unique<Counter>("Filtered", *cc);
insertBand(std::numeric_limits<double>::infinity());
}
insertBand(value);
}
}
void addMeasurement(double measurement, bool filtered = false) {
if (filtered && filteredCount) {
++(*filteredCount);
} else if (bands.size() > 0) {
auto itr = bands.upper_bound(measurement);
ASSERT(itr != bands.end());
++(*itr->second);
}
}
void clearBands() {
logger = Void();
bands.clear();
filteredCount.reset();
cc.reset();
}
~LatencyBands() { clearBands(); }
private:
std::map<double, std::unique_ptr<Counter>> bands;
std::unique_ptr<Counter> filteredCount;
std::function<void(TraceEvent&)> decorator;
std::string name;
UID id;
@ -231,9 +196,22 @@ private:
std::unique_ptr<CounterCollection> cc;
Future<Void> logger;
void insertBand(double value) {
bands.emplace(std::make_pair(value, std::make_unique<Counter>(format("Band%f", value), *cc)));
}
void insertBand(double value);
public:
LatencyBands(
std::string const& name,
UID id,
double loggingInterval,
std::function<void(TraceEvent&)> const& decorator = [](auto&) {});
LatencyBands(LatencyBands&&) = default;
LatencyBands& operator=(LatencyBands&&) = default;
void addThreshold(double value);
void addMeasurement(double measurement, int count = 1, Filtered = Filtered::False);
void clearBands();
~LatencyBands();
};
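// Usage sketch (illustrative): a server role can construct bands with a decorator that annotates every trace
// event, register thresholds once, and record batched or filtered measurements:
//
//   LatencyBands bands("ReadLatencyBands", id, 5.0 /* logging interval, seconds */,
//                      [](TraceEvent& ev) { ev.detail("Role", "StorageServer"); });
//   bands.addThreshold(0.001);
//   bands.addThreshold(0.01);
//   bands.addMeasurement(latency);                     // count defaults to 1
//   bands.addMeasurement(latency, 10, Filtered::True); // 10 samples excluded from the bands, counted as Filtered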
class LatencySample {

View File

@ -734,6 +734,7 @@ public:
// If cancelled, request was or will be delivered zero or more times.
template <class X>
Future<REPLY_TYPE(X)> getReply(const X& value) const {
// Ensure the same request isn't used multiple times
ASSERT(!getReplyPromise(value).getFuture().isReady());
if (queue->isRemoteEndpoint()) {
return sendCanceler(getReplyPromise(value),

View File

@ -54,6 +54,7 @@ public:
FailDisk,
RebootAndDelete,
RebootProcessAndDelete,
RebootProcessAndSwitch,
Reboot,
RebootProcess,
None
@ -104,6 +105,7 @@ public:
bool excluded;
bool cleared;
bool rebooting;
bool drProcess;
std::vector<flowGlobalType> globals;
INetworkConnections* network;
@ -128,8 +130,8 @@ public:
const char* coordinationFolder)
: name(name), coordinationFolder(coordinationFolder), dataFolder(dataFolder), machine(nullptr),
addresses(addresses), address(addresses.address), locality(locality), startingClass(startingClass),
failed(false), excluded(false), cleared(false), rebooting(false), network(net), fault_injection_r(0),
fault_injection_p1(0), fault_injection_p2(0), failedDisk(false) {
failed(false), excluded(false), cleared(false), rebooting(false), drProcess(false), network(net),
fault_injection_r(0), fault_injection_p1(0), fault_injection_p2(0), failedDisk(false) {
uid = deterministicRandom()->randomUniqueID();
}
@ -283,7 +285,8 @@ public:
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) = 0;
ProtocolVersion protocol,
bool drProcess) = 0;
virtual void killProcess(ProcessInfo* machine, KillType) = 0;
virtual void rebootProcess(Optional<Standalone<StringRef>> zoneId, bool allProcesses) = 0;
virtual void rebootProcess(ProcessInfo* process, KillType kt) = 0;
@ -304,6 +307,7 @@ public:
KillType kt,
bool forceKill = false,
KillType* ktFinal = nullptr) = 0;
virtual bool killAll(KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0;
// virtual KillType getMachineKillState( UID zoneID ) = 0;
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
std::vector<ProcessInfo*> const& deadProcesses,
@ -390,6 +394,13 @@ public:
return clearedAddresses.find(address) != clearedAddresses.end();
}
void switchCluster(NetworkAddress const& address) { switchedCluster[address] = !switchedCluster[address]; }
bool hasSwitchedCluster(NetworkAddress const& address) const {
return switchedCluster.find(address) != switchedCluster.end() ? switchedCluster.at(address) : false;
}
void toggleGlobalSwitchCluster() { globalSwitchedCluster = !globalSwitchedCluster; }
bool globalHasSwitchedCluster() const { return globalSwitchedCluster; }
void excludeAddress(NetworkAddress const& address) {
excludedAddresses[address]++;
TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]);
@ -540,6 +551,8 @@ private:
std::set<Optional<Standalone<StringRef>>> swapsDisabled;
std::map<NetworkAddress, int> excludedAddresses;
std::map<NetworkAddress, int> clearedAddresses;
std::map<NetworkAddress, bool> switchedCluster;
bool globalSwitchedCluster = false;
std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
std::map<std::string, double> disabledMap;
bool allSwapsDisabled;

View File

@ -1261,7 +1261,8 @@ public:
ProcessClass startingClass,
const char* dataFolder,
const char* coordinationFolder,
ProtocolVersion protocol) override {
ProtocolVersion protocol,
bool drProcess) override {
ASSERT(locality.machineId().present());
MachineInfo& machine = machines[locality.machineId().get()];
if (!machine.machineId.present())
@ -1311,6 +1312,7 @@ public:
m->excluded = g_simulator->isExcluded(NetworkAddress(ip, port, true, false));
m->cleared = g_simulator->isCleared(addresses.address);
m->protocolVersion = protocol;
m->drProcess = drProcess;
m->setGlobal(enTDMetrics, (flowGlobalType)&m->tdmetrics);
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
@ -1324,7 +1326,8 @@ public:
.detail("Address", m->address)
.detail("MachineId", m->locality.machineId())
.detail("Excluded", m->excluded)
.detail("Cleared", m->cleared);
.detail("Cleared", m->cleared)
.detail("DrProcess", m->drProcess);
if (std::string(name) == "remote flow process") {
protectedAddresses.insert(m->address);
@ -1407,6 +1410,7 @@ public:
for (auto processInfo : getAllProcesses()) {
if (currentDcId != processInfo->locality.dcId() || // skip other dc
processInfo->startingClass != ProcessClass::BlobWorkerClass || // skip non blob workers
processInfo->failed || // if process was killed but has not yet been removed from the process list
processInfo->locality.machineId() == machineId) { // skip current machine
continue;
}
@ -1794,6 +1798,15 @@ public:
}
return result;
}
bool killAll(KillType kt, bool forceKill, KillType* ktFinal) override {
bool result = false;
for (auto& machine : machines) {
if (killMachine(machine.second.machineId, kt, forceKill, ktFinal)) {
result = true;
}
}
return result;
}
bool killMachine(Optional<Standalone<StringRef>> machineId,
KillType kt,
bool forceKill,
@ -1816,6 +1829,7 @@ public:
}
int processesOnMachine = 0;
bool isMainCluster = true; // false for machines running DR processes
KillType originalKt = kt;
// Reboot if any of the processes are protected and count the number of processes not rebooting
@ -1824,6 +1838,9 @@ public:
kt = Reboot;
if (!process->rebooting)
processesOnMachine++;
if (process->drProcess) {
isMainCluster = false;
}
}
// Do nothing, if no processes to kill
@ -1950,8 +1967,13 @@ public:
probe::context::sim2,
probe::assert::simOnly);
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine && kt >= RebootAndDelete) {
if (isMainCluster && originalKt == RebootProcessAndSwitch) {
// When killing processes with the RebootProcessAndSwitch kill
// type, processes in the original cluster should be rebooted in
// order to kill any zombie processes.
kt = KillType::Reboot;
} else if (processesOnMachine != processesPerMachine && kt != RebootProcessAndSwitch) {
// Check if any processes on machine are rebooting
CODE_PROBE(true,
"Attempted reboot, but the target did not have all of its processes running",
probe::context::sim2,
@ -1968,24 +1990,6 @@ public:
return false;
}
// Check if any processes on machine are rebooting
if (processesOnMachine != processesPerMachine) {
CODE_PROBE(true,
"Attempted reboot and kill, but the target did not have all of its processes running",
probe::context::sim2,
probe::assert::simOnly);
TraceEvent(SevWarn, "AbortedKill")
.detail("KillType", kt)
.detail("MachineId", machineId)
.detail("Reason", "Machine processes does not match number of processes per machine")
.detail("Processes", processesOnMachine)
.detail("ProcessesPerMachine", processesPerMachine)
.backtrace();
if (ktFinal)
*ktFinal = None;
return false;
}
TraceEvent("KillMachine")
.detail("MachineId", machineId)
.detail("Kt", kt)
@ -2008,7 +2012,7 @@ public:
if (process->startingClass != ProcessClass::TesterClass)
killProcess_internal(process, kt);
}
} else if (kt == Reboot || kt == RebootAndDelete) {
} else if (kt == Reboot || kt == RebootAndDelete || kt == RebootProcessAndSwitch) {
for (auto& process : machines[machineId].processes) {
TraceEvent("KillMachineProcess")
.detail("KillType", kt)
@ -2564,7 +2568,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
try {
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
kt == ISimulator::RebootProcessAndDelete);
kt == ISimulator::RebootProcessAndDelete || kt == ISimulator::RebootProcessAndSwitch);
CODE_PROBE(kt == ISimulator::RebootProcess,
"Simulated process rebooted",
@ -2580,6 +2584,10 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
"Simulated process rebooted with data and coordination state deletion",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(kt == ISimulator::RebootProcessAndSwitch,
"Simulated process rebooted with different cluster file",
probe::assert::simOnly,
probe::context::sim2);
if (p->rebooting || !p->isReliable()) {
TraceEvent(SevDebug, "DoRebootFailed")
@ -2608,6 +2616,8 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
p->cleared = true;
g_simulator->clearAddress(p->address);
} else if (kt == ISimulator::RebootProcessAndSwitch) {
g_simulator->switchCluster(p->address);
}
p->shutdownSignal.send(kt);
} catch (Error& e) {

View File

@ -27,6 +27,7 @@
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/EncryptionOpsUtils.h"
#include "fdbserver/IKeyValueStore.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/LogProtocolMessage.h"
#include "fdbserver/LogSystem.h"
#include "flow/Error.h"
@ -87,9 +88,10 @@ public:
ApplyMetadataMutationsImpl(const SpanContext& spanContext_,
ResolverData& resolverData_,
const VectorRef<MutationRef>& mutations_)
const VectorRef<MutationRef>& mutations_,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* cipherKeys_)
: spanContext(spanContext_), dbgid(resolverData_.dbgid), arena(resolverData_.arena), mutations(mutations_),
txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit),
cipherKeys(cipherKeys_), txnStateStore(resolverData_.txnStateStore), toCommit(resolverData_.toCommit),
confChange(resolverData_.confChanges), logSystem(resolverData_.logSystem), popVersion(resolverData_.popVersion),
keyInfo(resolverData_.keyInfo), storageCache(resolverData_.storageCache),
initialCommit(resolverData_.initialCommit), forResolver(true) {}
@ -132,7 +134,7 @@ private:
std::unordered_map<UID, StorageServerInterface>* tssMapping = nullptr;
std::map<TenantName, TenantMapEntry>* tenantMap = nullptr;
std::unordered_map<int64_t, TenantName>* tenantIdIndex = nullptr;
std::unordered_map<int64_t, TenantNameUniqueSet>* tenantIdIndex = nullptr;
// true if the mutations were already written to the txnStateStore as part of recovery
bool initialCommit = false;
@ -160,11 +162,13 @@ private:
private:
void writeMutation(const MutationRef& m) {
if (forResolver || !isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) {
if (!isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION)) {
toCommit->writeTypedMessage(m);
} else {
ASSERT(cipherKeys != nullptr);
Arena arena;
CODE_PROBE(!forResolver, "encrypting metadata mutations");
CODE_PROBE(forResolver, "encrypting resolver mutations");
toCommit->writeTypedMessage(m.encryptMetadata(*cipherKeys, arena, BlobCipherMetrics::TLOG));
}
}
@ -613,7 +617,7 @@ private:
m.param1.startsWith(applyMutationsAddPrefixRange.begin) ||
m.param1.startsWith(applyMutationsRemovePrefixRange.begin) || m.param1.startsWith(tagLocalityListPrefix) ||
m.param1.startsWith(serverTagHistoryPrefix) ||
m.param1.startsWith(testOnlyTxnStateStorePrefixRange.begin) || m.param1 == clusterIdKey) {
m.param1.startsWith(testOnlyTxnStateStorePrefixRange.begin)) {
txnStateStore->set(KeyValueRef(m.param1, m.param2));
}
@ -669,7 +673,7 @@ private:
(*tenantMap)[tenantName] = tenantEntry;
if (tenantIdIndex) {
(*tenantIdIndex)[tenantEntry.id] = tenantName;
(*tenantIdIndex)[tenantEntry.id].insert(tenantName);
}
}
@ -799,7 +803,7 @@ private:
.detail("Tag", tag.toString())
.detail("Server", decodeServerTagKey(kv.key));
if (!forResolver) {
logSystem->pop(popVersion, decodeServerTagValue(kv.value));
logSystem->pop(popVersion, tag);
(*tag_popped)[tag] = popVersion;
}
ASSERT_WE_THINK(forResolver ^ (tag_popped != nullptr));
@ -807,11 +811,11 @@ private:
if (toCommit) {
MutationRef privatized = m;
privatized.param1 = kv.key.withPrefix(systemKeys.begin, arena);
privatized.param2 = keyAfter(kv.key, arena).withPrefix(systemKeys.begin, arena);
privatized.param2 = keyAfter(privatized.param1, arena);
TraceEvent(SevDebug, "SendingPrivatized_ClearServerTag", dbgid).detail("M", privatized);
toCommit->addTag(decodeServerTagValue(kv.value));
toCommit->addTag(tag);
writeMutation(privatized);
}
}
@ -1096,7 +1100,11 @@ private:
// TODO: O(n) operation, optimize cpu
auto itr = startItr;
while (itr != endItr) {
tenantIdIndex->erase(itr->second.id);
auto indexItr = tenantIdIndex->find(itr->second.id);
ASSERT(indexItr != tenantIdIndex->end());
if (indexItr->second.remove(itr->first)) {
tenantIdIndex->erase(indexItr);
}
itr++;
}
}
@ -1343,8 +1351,9 @@ void applyMetadataMutations(SpanContext const& spanContext,
void applyMetadataMutations(SpanContext const& spanContext,
ResolverData& resolverData,
const VectorRef<MutationRef>& mutations) {
ApplyMetadataMutationsImpl(spanContext, resolverData, mutations).apply();
const VectorRef<MutationRef>& mutations,
const std::unordered_map<EncryptCipherDomainId, Reference<BlobCipherKey>>* pCipherKeys) {
ApplyMetadataMutationsImpl(spanContext, resolverData, mutations, pCipherKeys).apply();
}
void applyMetadataMutations(SpanContext const& spanContext,

View File

@ -0,0 +1,202 @@
/*
* BlobConnectionProviderTest.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobConnectionProvider.h"
#include "flow/UnitTest.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // has to be last include
void forceLinkBlobConnectionProviderTests() {}
struct ConnectionProviderTestSettings {
uint32_t numProviders;
uint32_t filesPerProvider;
uint32_t maxFileMemory;
uint32_t maxFileSize;
uint32_t threads;
bool uniformProviderChoice;
double readWriteSplit;
double runtime;
int writeOps;
int readOps;
ConnectionProviderTestSettings() {
numProviders = deterministicRandom()->randomSkewedUInt32(1, 1000);
filesPerProvider =
1 + std::min((uint32_t)100, deterministicRandom()->randomSkewedUInt32(10, 10000) / numProviders);
maxFileMemory = 1024 * 1024 * 1024;
maxFileSize = maxFileMemory / (numProviders * filesPerProvider);
maxFileSize = deterministicRandom()->randomSkewedUInt32(8, std::min((uint32_t)(16 * 1024 * 1024), maxFileSize));
threads = deterministicRandom()->randomInt(16, 128);
uniformProviderChoice = deterministicRandom()->coinflip();
readWriteSplit = deterministicRandom()->randomInt(1, 10) / 10.0;
runtime = 60.0;
writeOps = 0;
readOps = 0;
}
};
struct ProviderTestData {
Reference<BlobConnectionProvider> provider;
std::vector<std::pair<std::string, Value>> data;
std::unordered_set<std::string> usedNames;
ProviderTestData() {}
explicit ProviderTestData(Reference<BlobConnectionProvider> provider) : provider(provider) {}
};
ACTOR Future<Void> createObject(ConnectionProviderTestSettings* settings, ProviderTestData* provider) {
// pick object name before wait so no collisions between concurrent writes
std::string objName;
loop {
objName = deterministicRandom()->randomAlphaNumeric(12);
if (provider->usedNames.insert(objName).second) {
break;
}
}
int randomDataSize = deterministicRandom()->randomInt(1, settings->maxFileSize);
state Value data = makeString(randomDataSize);
deterministicRandom()->randomBytes(mutateString(data), randomDataSize);
state Reference<BackupContainerFileSystem> bstore;
state std::string fullPath;
std::tie(bstore, fullPath) = provider->provider->createForWrite(objName);
state Reference<IBackupFile> file = wait(bstore->writeFile(fullPath));
wait(file->append(data.begin(), data.size()));
wait(file->finish());
// after write, put in the readable list
provider->data.push_back({ fullPath, data });
return Void();
}
ACTOR Future<Void> readAndVerifyObject(ProviderTestData* provider, std::string objFullPath, Value expectedData) {
Reference<BackupContainerFileSystem> bstore = provider->provider->getForRead(objFullPath);
state Reference<IAsyncFile> reader = wait(bstore->readFile(objFullPath));
state Value actualData = makeString(expectedData.size());
int readSize = wait(reader->read(mutateString(actualData), expectedData.size(), 0));
ASSERT_EQ(expectedData.size(), readSize);
ASSERT(expectedData == actualData);
return Void();
}
Future<Void> deleteObject(ProviderTestData* provider, std::string objFullPath) {
Reference<BackupContainerFileSystem> bstore = provider->provider->getForRead(objFullPath);
return bstore->deleteFile(objFullPath);
}
ACTOR Future<Void> workerThread(ConnectionProviderTestSettings* settings, std::vector<ProviderTestData>* providers) {
state double endTime = now() + settings->runtime;
try {
while (now() < endTime) {
// randomly pick provider
int providerIdx;
if (settings->uniformProviderChoice) {
providerIdx = deterministicRandom()->randomInt(0, providers->size());
} else {
providerIdx = deterministicRandom()->randomSkewedUInt32(0, providers->size());
}
ProviderTestData* provider = &(*providers)[providerIdx];
// randomly pick create or read
bool doWrite = deterministicRandom()->random01() < settings->readWriteSplit;
if (provider->usedNames.size() < settings->filesPerProvider && (provider->data.empty() || doWrite)) {
// create an object
wait(createObject(settings, provider));
settings->writeOps++;
} else if (!provider->data.empty()) {
// read a random object
auto& readInfo = provider->data[deterministicRandom()->randomInt(0, provider->data.size())];
wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
settings->readOps++;
} else {
// other threads are creating files up to filesPerProvider limit, but none finished yet. Just wait
wait(delay(0.1));
}
}
return Void();
} catch (Error& e) {
fmt::print("WorkerThread Unexpected Error {0}\n", e.name());
throw e;
}
}
ACTOR Future<Void> checkAndCleanUp(ProviderTestData* provider) {
state int i;
ASSERT(provider->usedNames.size() == provider->data.size());
for (i = 0; i < provider->data.size(); i++) {
auto& readInfo = provider->data[i];
wait(readAndVerifyObject(provider, readInfo.first, readInfo.second));
wait(deleteObject(provider, provider->data[i].first));
}
return Void();
}
// maybe this should be a workload instead?
TEST_CASE("/fdbserver/blob/connectionprovider") {
state ConnectionProviderTestSettings settings;
state std::vector<ProviderTestData> providers;
providers.reserve(settings.numProviders);
for (int i = 0; i < settings.numProviders; i++) {
std::string nameStr = std::to_string(i);
BlobMetadataDomainName name(nameStr);
auto metadata = createRandomTestBlobMetadata(SERVER_KNOBS->BG_URL, i, name);
providers.emplace_back(BlobConnectionProvider::newBlobConnectionProvider(metadata));
}
fmt::print("BlobConnectionProviderTest\n");
state std::vector<Future<Void>> futures;
futures.reserve(settings.threads);
for (int i = 0; i < settings.threads; i++) {
futures.push_back(workerThread(&settings, &providers));
}
wait(waitForAll(futures));
fmt::print("BlobConnectionProviderTest workload phase complete with {0} files and {1} reads\n",
settings.writeOps,
settings.readOps);
futures.clear();
futures.reserve(providers.size());
for (int i = 0; i < providers.size(); i++) {
futures.push_back(checkAndCleanUp(&providers[i]));
}
wait(waitForAll(futures));
fmt::print("BlobConnectionProviderTest check and cleanup phase complete\n");
return Void();
}

View File

@ -462,7 +462,7 @@ ACTOR Future<Void> loadBlobMetadataForTenants(
}
// FIXME: if one tenant gets an error, don't kill whole process
// TODO: add latency metrics
state double startTime = now();
loop {
Future<EKPGetLatestBlobMetadataReply> requestFuture;
if (self->dbInfo.isValid() && self->dbInfo->get().encryptKeyProxy.present()) {
@ -485,6 +485,8 @@ ACTOR Future<Void> loadBlobMetadataForTenants(
ASSERT(dataEntry.begin() == info->second.prefix);
dataEntry.cvalue()->updateBStore(metadata);
}
double elapsed = now() - startTime;
BlobCipherMetrics::getInstance()->getBlobMetadataLatency.addMeasurement(elapsed);
return Void();
}
when(wait(self->dbInfo->onChange())) {}

View File

@ -2776,6 +2776,7 @@ ACTOR Future<Void> haltBlobWorker(Reference<BlobManagerData> bmData, BlobWorkerI
if (bmData->iAmReplaced.canBeSet()) {
bmData->iAmReplaced.send(Void());
}
throw;
}
}
@ -2896,6 +2897,7 @@ ACTOR Future<Void> monitorBlobWorkerStatus(Reference<BlobManagerData> bmData, Bl
if (bmData->iAmReplaced.canBeSet()) {
bmData->iAmReplaced.send(Void());
}
throw blob_manager_replaced();
}
BoundaryEvaluation newEval(rep.continueEpoch,
@ -4236,7 +4238,13 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
Version purgeVersion,
KeyRange granuleRange,
Optional<UID> mergeChildID,
bool force) {
bool force,
Future<Void> parentFuture) {
// wait for parent to finish first to avoid ordering/orphaning issues
wait(parentFuture);
// yield to avoid a long callstack and to allow this to get cancelled
wait(delay(0));
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule [{1} - {2}): {3} @ {4}{5}\n",
self->epoch,
@ -4294,6 +4302,11 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
// deleting files before corresponding metadata reduces the # of orphaned files.
wait(waitForAll(deletions));
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
// delete metadata in FDB (history entry and file keys)
if (BM_PURGE_DEBUG) {
fmt::print(
@ -4329,6 +4342,11 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
}
}
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: success {2}\n",
self->epoch,
@ -4499,7 +4517,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
state std::queue<std::tuple<KeyRange, Version, Version, Optional<UID>>> historyEntryQueue;
// stacks of <granuleId, historyKey> and <granuleId> (and mergeChildID) to track which granules to delete
state std::vector<std::tuple<UID, Key, KeyRange, Optional<UID>>> toFullyDelete;
state std::vector<std::tuple<UID, Key, KeyRange, Optional<UID>, Version>> toFullyDelete;
state std::vector<std::pair<UID, KeyRange>> toPartiallyDelete;
// track which granules we have already added to traversal
@ -4735,7 +4753,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
fmt::print(
"BM {0} Granule {1} will be FULLY deleted\n", self->epoch, currHistoryNode.granuleID.toString());
}
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange, mergeChildID });
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange, mergeChildID, startVersion });
} else if (startVersion < purgeVersion) {
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Granule {1} will be partially deleted\n",
@ -4808,36 +4826,65 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
.detail("DeletingFullyCount", toFullyDelete.size())
.detail("DeletingPartiallyCount", toPartiallyDelete.size());
state std::vector<Future<Void>> partialDeletions;
state int i;
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to fully delete\n", self->epoch, toFullyDelete.size());
}
// Go backwards through set of granules to guarantee deleting oldest first. This avoids orphaning granules in the
// deletion process
// FIXME: could track explicit parent dependencies and parallelize so long as a parent and child aren't running in
// parallel, but that's non-trivial
for (i = toFullyDelete.size() - 1; i >= 0; --i) {
state UID granuleId;
Key historyKey;
KeyRange keyRange;
Optional<UID> mergeChildId;
std::tie(granuleId, historyKey, keyRange, mergeChildId) = toFullyDelete[i];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString());
if (!toFullyDelete.empty()) {
state std::vector<Future<Void>> fullDeletions;
KeyRangeMap<std::pair<Version, Future<Void>>> parentDelete;
parentDelete.insert(normalKeys, { 0, Future<Void>(Void()) });
std::vector<std::pair<Version, int>> deleteOrder;
deleteOrder.reserve(toFullyDelete.size());
for (int i = 0; i < toFullyDelete.size(); i++) {
deleteOrder.push_back({ std::get<4>(toFullyDelete[i]), i });
}
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, keyRange, mergeChildId, force));
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
std::sort(deleteOrder.begin(), deleteOrder.end());
for (i = 0; i < deleteOrder.size(); i++) {
state UID granuleId;
Key historyKey;
KeyRange keyRange;
Optional<UID> mergeChildId;
Version startVersion;
std::tie(granuleId, historyKey, keyRange, mergeChildId, startVersion) =
toFullyDelete[deleteOrder[i].second];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString());
}
std::vector<Future<Void>> parents;
auto parentRanges = parentDelete.intersectingRanges(keyRange);
for (auto& it : parentRanges) {
if (startVersion <= it.cvalue().first) {
fmt::print("ERROR: [{0} - {1}) @ {2} <= [{3} - {4}) @ {5}\n",
keyRange.begin.printable(),
keyRange.end.printable(),
startVersion,
it.begin().printable(),
it.end().printable(),
it.cvalue().first);
}
ASSERT(startVersion > it.cvalue().first);
parents.push_back(it.cvalue().second);
}
Future<Void> deleteFuture = fullyDeleteGranule(
self, granuleId, historyKey, purgeVersion, keyRange, mergeChildId, force, waitForAll(parents));
fullDeletions.push_back(deleteFuture);
parentDelete.insert(keyRange, { startVersion, deleteFuture });
}
wait(waitForAll(fullDeletions));
}
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to partially delete\n", self->epoch, toPartiallyDelete.size());
}
state std::vector<Future<Void>> partialDeletions;
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId;
KeyRange keyRange;
@ -4850,6 +4897,11 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
wait(waitForAll(partialDeletions));
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
if (force) {
tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -4875,6 +4927,11 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
}
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
// Now that all the necessary granules and their files have been deleted, we can
// clear the purgeIntent key to signify that the work is done. However, there could have been
// another purgeIntent that got written for this table while we were processing this one.
@ -5299,6 +5356,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
fmt::print("BM {} exiting because it is replaced\n", self->epoch);
}
TraceEvent("BlobManagerReplaced", bmInterf.id()).detail("Epoch", epoch);
wait(delay(0.0));
break;
}
when(HaltBlobManagerRequest req = waitNext(bmInterf.haltBlobManager.getFuture())) {
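
The reworked purge path above sorts granules to fully delete by their start version and hands each deletion the futures of overlapping, older granules, so a child is never deleted before its parents finish. A minimal standalone sketch of that dependency-chaining idea using std::async and std::shared_future (the string ranges, overlap test, and "delete" body are stand-ins, not FoundationDB code):

#include <algorithm>
#include <cstddef>
#include <future>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct GranuleToDelete {
    std::string beginKey, endKey; // half-open range [beginKey, endKey)
    long long startVersion;
};

static bool overlaps(const GranuleToDelete& a, const GranuleToDelete& b) {
    return a.beginKey < b.endKey && b.beginKey < a.endKey;
}

int main() {
    std::vector<GranuleToDelete> toFullyDelete = {
        { "c", "f", 300 }, { "a", "m", 100 }, { "b", "d", 200 },
    };

    // Delete oldest granules first: order indices by start version.
    std::vector<std::pair<long long, size_t>> deleteOrder;
    for (size_t i = 0; i < toFullyDelete.size(); i++)
        deleteOrder.push_back({ toFullyDelete[i].startVersion, i });
    std::sort(deleteOrder.begin(), deleteOrder.end());

    // Each scheduled deletion remembers its future; a newer granule waits on the
    // futures of every overlapping, already-scheduled (older) granule.
    std::vector<std::pair<GranuleToDelete, std::shared_future<void>>> scheduled;

    for (auto [version, idx] : deleteOrder) {
        GranuleToDelete g = toFullyDelete[idx];
        std::vector<std::shared_future<void>> parents;
        for (auto& [parent, parentDone] : scheduled)
            if (overlaps(parent, g))
                parents.push_back(parentDone);

        std::shared_future<void> done =
            std::async(std::launch::async, [g, version = version, parents] {
                for (auto& p : parents)
                    p.wait(); // parent deletions finish before this one starts
                std::cout << "deleting [" << g.beginKey << ", " << g.endKey
                          << ") @ " << version << "\n";
            }).share();
        scheduled.push_back({ g, done });
    }
    for (auto& entry : scheduled)
        entry.second.wait();
}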

View File

@ -18,8 +18,6 @@
* limitations under the License.
*/
#include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
@ -35,6 +33,8 @@
#include "fdbserver/WaitFailure.h"
#include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // has to be last include
#include "flow/network.h"
#include <algorithm>
@ -72,7 +72,7 @@ public:
self->blobGranules_ = granules;
wait(prepare(self, normalKeys));
wait(advanceVersion(self));
wait(serverLoop(self));
return Void();
}
@ -148,9 +148,78 @@ private:
}
}
// Print migration progress periodically
ACTOR static Future<Void> logProgress(Reference<BlobMigrator> self) {
loop {
bool done = wait(checkProgress(self));
if (done)
return Void();
wait(delay(SERVER_KNOBS->BLOB_MIGRATOR_CHECK_INTERVAL));
}
}
// Check key ranges that are migrated. Return true if all ranges are done
ACTOR static Future<bool> checkProgress(Reference<BlobMigrator> self) {
state Transaction tr(self->db_);
loop {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
// Get key ranges that are still owned by the migrator. Those ranges are
// incomplete migrations
state UID serverID = self->interf_.ssi.id();
RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(serverID), normalKeys));
// Count the size of incomplete ranges
int64_t incompleted = 0;
for (auto i = 0; i < ranges.size() - 1; ++i) {
if (ranges[i].value == serverKeysTrue) {
KeyRangeRef range(ranges[i].key, ranges[i + 1].key);
int64_t bytes = sizeInBytes(self, range);
dprint(" incompleted {}, size: {}\n", range.toString(), bytes);
incompleted += bytes;
}
}
// Calculate progress
int64_t total = sizeInBytes(self);
int progress = (total - incompleted) * 100 / total;
bool done = incompleted == 0;
dprint("Progress {} :{}%. done {}\n", serverID.toString(), progress, done);
return done;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Advance version, so that future commits will have a larger version than the restored data
ACTOR static Future<Void> advanceVersion(Reference<BlobMigrator> self) {
state Transaction tr(self->db_);
loop {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
Version currentVersion = wait(tr.getRawReadVersion());
Version expectedVersion = maxVersion(self);
if (currentVersion <= expectedVersion) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(expectedVersion + 1, Unversioned()));
dprint("Advance version from {} to {}\n", currentVersion, expectedVersion);
wait(tr.commit());
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
self->actors_.add(logProgress(self));
self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self));
loop {
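
checkProgress above sums the bytes of key ranges still assigned to the migrator and reports (total - incomplete) * 100 / total. A tiny sketch of that accounting with made-up byte counts in place of krmGetRanges plus storage metrics, and an extra total == 0 guard that the sketch adds for safety:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Bytes of each key range still assigned to the migrator; the real code
    // derives these from the system keyspace and storage metrics.
    std::vector<int64_t> incompleteRangeBytes = { 12'000'000, 5'500'000 };
    int64_t total = 100'000'000; // total bytes of the restored key space

    int64_t incomplete = 0;
    for (int64_t bytes : incompleteRangeBytes)
        incomplete += bytes;

    bool done = incomplete == 0;
    // Guard against total == 0 (added here for safety in the sketch).
    int progress = total == 0 ? 100 : int((total - incomplete) * 100 / total);
    std::cout << "progress " << progress << "%, done " << done << "\n"; // 82%, done 0
}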

View File

@ -3549,7 +3549,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
continue;
}
state Reference<GranuleMetadata> metadata = m;
state Version granuleBeginVersion = req.beginVersion;
// state Version granuleBeginVersion = req.beginVersion;
// skip waiting for CF ready for recovery mode
if (!isFullRestoreMode()) {
choose {
@ -4470,9 +4470,10 @@ ACTOR Future<Void> handleRangeAssign(Reference<BlobWorkerData> bwData,
return Void();
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
if (!bwData->shuttingDown) {
if (!bwData->shuttingDown && !isSelfReassign) {
// the cancelled was because the granule open was cancelled, not because the whole blob
// worker was.
ASSERT(!req.reply.isSet());
req.reply.sendError(granule_assignment_conflict());
}
throw e;

View File

@ -25,6 +25,7 @@
#include <set>
#include <vector>
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbrpc/FailureMonitor.h"
@ -32,6 +33,7 @@
#include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h"
#include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbclient/NativeAPI.actor.h"
@ -66,6 +68,7 @@
#include "fdbrpc/ReplicationUtils.h"
#include "fdbrpc/sim_validation.h"
#include "fdbclient/KeyBackedTypes.h"
#include "flow/Error.h"
#include "flow/Trace.h"
#include "flow/Util.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -389,7 +392,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
wait(delay(0.0));
recoveryCore.cancel();
wait(cleanupRecoveryActorCollection(recoveryData, true /* exThrown */));
wait(cleanupRecoveryActorCollection(recoveryData, /*exThrown=*/true));
ASSERT(addActor.isEmpty());
CODE_PROBE(err.code() == error_code_tlog_failed, "Terminated due to tLog failure");
@ -1060,8 +1063,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
.detail("GrvProxies", req.grvProxies.size())
.detail("RecoveryCount", req.recoveryCount)
.detail("Stalled", req.recoveryStalled)
.detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch)
.detail("ClusterId", req.clusterId);
.detail("OldestBackupEpoch", req.logSystemConfig.oldestBackupEpoch);
// make sure the request comes from an active database
auto db = &self->db;
@ -1120,8 +1122,9 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
// Construct the client information
if (db->clientInfo->get().commitProxies != req.commitProxies ||
db->clientInfo->get().grvProxies != req.grvProxies ||
db->clientInfo->get().tenantMode != db->config.tenantMode || db->clientInfo->get().clusterId != req.clusterId ||
db->clientInfo->get().tenantMode != db->config.tenantMode ||
db->clientInfo->get().isEncryptionEnabled != SERVER_KNOBS->ENABLE_ENCRYPTION ||
db->clientInfo->get().clusterId != db->serverInfo->get().client.clusterId ||
db->clientInfo->get().clusterType != db->clusterType ||
db->clientInfo->get().metaclusterName != db->metaclusterName ||
db->clientInfo->get().encryptKeyProxy != db->serverInfo->get().encryptKeyProxy) {
@ -1133,9 +1136,9 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
.detail("ReqCPs", req.commitProxies)
.detail("TenantMode", db->clientInfo->get().tenantMode.toString())
.detail("ReqTenantMode", db->config.tenantMode.toString())
.detail("ClusterId", db->clientInfo->get().clusterId)
.detail("ReqClusterId", req.clusterId)
.detail("EncryptionEnabled", SERVER_KNOBS->ENABLE_ENCRYPTION)
.detail("ClusterId", db->serverInfo->get().client.clusterId)
.detail("ClientClusterId", db->clientInfo->get().clusterId)
.detail("ClusterType", db->clientInfo->get().clusterType)
.detail("ReqClusterType", db->clusterType)
.detail("MetaclusterName", db->clientInfo->get().metaclusterName)
@ -1149,7 +1152,7 @@ void clusterRegisterMaster(ClusterControllerData* self, RegisterMasterRequest co
clientInfo.commitProxies = req.commitProxies;
clientInfo.grvProxies = req.grvProxies;
clientInfo.tenantMode = TenantAPI::tenantModeForClusterType(db->clusterType, db->config.tenantMode);
clientInfo.clusterId = req.clusterId;
clientInfo.clusterId = db->serverInfo->get().client.clusterId;
clientInfo.clusterType = db->clusterType;
clientInfo.metaclusterName = db->metaclusterName;
db->clientInfo->set(clientInfo);
@ -1228,6 +1231,17 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
std::vector<NetworkAddress> coordinatorAddresses = wait(cs.tryResolveHostnames());
const WorkerInterface& w = req.wi;
if (req.clusterId.present() && self->clusterId->get().present() && req.clusterId != self->clusterId->get() &&
req.processClass != ProcessClass::TesterClass) {
TraceEvent(g_network->isSimulated() ? SevWarnAlways : SevError, "WorkerBelongsToExistingCluster", self->id)
.detail("WorkerClusterId", req.clusterId)
.detail("ClusterControllerClusterId", self->clusterId->get())
.detail("WorkerId", w.id())
.detail("ProcessId", w.locality.processId());
req.reply.sendError(invalid_cluster_id());
return Void();
}
ProcessClass newProcessClass = req.processClass;
auto info = self->id_worker.find(w.locality.processId());
ClusterControllerPriorityInfo newPriorityInfo = req.priorityInfo;
@ -2964,13 +2978,76 @@ ACTOR Future<Void> metaclusterMetricsUpdater(ClusterControllerData* self) {
}
}
// Update the DBInfo state with this process's cluster ID. If this process does
// not have a cluster ID and one does not exist in the database, generate one.
ACTOR Future<Void> updateClusterId(ClusterControllerData* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
loop {
try {
state Optional<UID> durableClusterId = self->clusterId->get();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> clusterIdVal = wait(tr->get(clusterIdKey));
if (clusterIdVal.present()) {
UID clusterId = BinaryReader::fromStringRef<UID>(clusterIdVal.get(), IncludeVersion());
if (durableClusterId.present()) {
// If this process has an on disk file for the cluster ID,
// verify it matches the value in the database.
ASSERT(clusterId == durableClusterId.get());
} else {
// Otherwise, write the cluster ID in the database to the
// DbInfo object so all clients will learn of the cluster
// ID.
durableClusterId = clusterId;
}
} else if (!durableClusterId.present()) {
// No cluster ID exists in the database or on the machine. Generate and set one.
ASSERT(!durableClusterId.present());
durableClusterId = deterministicRandom()->randomUniqueID();
tr->set(clusterIdKey, BinaryWriter::toValue(durableClusterId.get(), IncludeVersion()));
wait(tr->commit());
}
auto serverInfo = self->db.serverInfo->get();
if (!serverInfo.client.clusterId.isValid()) {
ASSERT(durableClusterId.present());
serverInfo.id = deterministicRandom()->randomUniqueID();
serverInfo.client.clusterId = durableClusterId.get();
self->db.serverInfo->set(serverInfo);
ClientDBInfo clientInfo = self->db.clientInfo->get();
clientInfo.id = deterministicRandom()->randomUniqueID();
clientInfo.clusterId = durableClusterId.get();
self->db.clientInfo->set(clientInfo);
}
return Void();
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
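
updateClusterId above is a read-or-generate loop: reuse the durable cluster ID if the process has one, otherwise adopt the one stored in the database, otherwise mint a new one and persist it before publishing it to ServerDBInfo/ClientDBInfo. A minimal sketch of that idempotent get-or-create step against a toy std::map standing in for the database (the key name and UID format are placeholders):

#include <iostream>
#include <map>
#include <optional>
#include <random>
#include <stdexcept>
#include <string>

using Store = std::map<std::string, std::string>;

static std::string randomUid() {
    std::mt19937_64 rng(std::random_device{}());
    return std::to_string(rng());
}

// Return the cluster ID, minting and persisting one only if neither the local
// durable copy nor the database already has it.
static std::string getOrCreateClusterId(Store& db, std::optional<std::string> durable) {
    auto it = db.find("\xff/clusterId");
    if (it != db.end()) {
        // A locally recorded ID must agree with what the database says.
        if (durable && *durable != it->second)
            throw std::runtime_error("cluster ID mismatch");
        return it->second;
    }
    if (!durable)
        durable = randomUid();
    db["\xff/clusterId"] = *durable;
    return *durable;
}

int main() {
    Store db;
    std::string first = getOrCreateClusterId(db, std::nullopt);
    std::string again = getOrCreateClusterId(db, first);
    std::cout << (first == again ? "stable cluster id: " : "BUG: ") << again << "\n";
}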
ACTOR Future<Void> handleGetEncryptionAtRestMode(ClusterControllerData* self, ClusterControllerFullInterface ccInterf) {
loop {
state GetEncryptionAtRestModeRequest req = waitNext(ccInterf.getEncryptionAtRestMode.getFuture());
TraceEvent("HandleGetEncryptionAtRestModeStart").detail("TlogId", req.tlogId);
EncryptionAtRestMode mode = wait(self->encryptionAtRestMode.getFuture());
GetEncryptionAtRestModeResponse resp;
resp.mode = mode;
req.reply.send(resp);
TraceEvent("HandleGetEncryptionAtRestModeEnd").detail("TlogId", req.tlogId).detail("Mode", resp.mode);
}
}
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
Future<Void> leaderFail,
ServerCoordinators coordinators,
LocalityData locality,
ConfigDBType configDBType,
Future<Void> recoveredDiskFiles) {
state ClusterControllerData self(interf, locality, coordinators);
Future<Void> recoveredDiskFiles,
Reference<AsyncVar<Optional<UID>>> clusterId) {
state ClusterControllerData self(interf, locality, coordinators, clusterId);
state Future<Void> coordinationPingDelay = delay(SERVER_KNOBS->WORKER_COORDINATION_PING_DELAY);
state uint64_t step = 0;
state Future<ErrorOr<Void>> error = errorOr(actorCollection(self.addActor.getFuture()));
@ -3007,6 +3084,8 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self));
self.addActor.send(updateClusterId(&self));
self.addActor.send(handleGetEncryptionAtRestMode(&self, interf));
self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics",
self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY,
@ -3027,8 +3106,8 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
endRole(Role::CLUSTER_CONTROLLER, interf.id(), "Stop Received Signal", true);
}
// We shut down normally even if there was a serious error (so this fdbserver may be re-elected cluster
// controller)
// We shut down normally even if there was a serious error (so this fdbserver may be re-elected
// cluster controller)
return Void();
}
when(OpenDatabaseRequest req = waitNext(interf.clientInterface.openDatabase.getFuture())) {
@ -3123,7 +3202,8 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
LocalityData locality,
ConfigDBType configDBType,
Future<Void> recoveredDiskFiles) {
Future<Void> recoveredDiskFiles,
Reference<AsyncVar<Optional<UID>>> clusterId) {
loop {
state ClusterControllerFullInterface cci;
state bool inRole = false;
@ -3150,7 +3230,8 @@ ACTOR Future<Void> clusterController(ServerCoordinators coordinators,
startRole(Role::CLUSTER_CONTROLLER, cci.id(), UID());
inRole = true;
wait(clusterControllerCore(cci, leaderFail, coordinators, locality, configDBType, recoveredDiskFiles));
wait(clusterControllerCore(
cci, leaderFail, coordinators, locality, configDBType, recoveredDiskFiles, clusterId));
}
} catch (Error& e) {
if (inRole)
@ -3174,14 +3255,15 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
Reference<AsyncVar<ClusterControllerPriorityInfo>> asyncPriorityInfo,
Future<Void> recoveredDiskFiles,
LocalityData locality,
ConfigDBType configDBType) {
ConfigDBType configDBType,
Reference<AsyncVar<Optional<UID>>> clusterId) {
// Defer this wait optimization if cluster configuration has 'Encryption data at-rest' enabled.
// Encryption depends on availability of the EncryptKeyProxy (EKP) FDB role to enable fetch/refresh of encryption keys
// created and managed by external KeyManagementService (KMS).
// Encryption depends on availability of the EncryptKeyProxy (EKP) FDB role to enable fetch/refresh of
// encryption keys created and managed by external KeyManagementService (KMS).
//
// TODO: Wait optimization is to ensure the worker server on the same process gets registered with the new CC before
// recruitment. Unify the codepath for both Encryption enable vs disable scenarios.
// TODO: Wait optimization is to ensure the worker server on the same process gets registered with the
// new CC before recruitment. Unify the codepath for both Encryption enable vs disable scenarios.
if (!SERVER_KNOBS->ENABLE_ENCRYPTION) {
wait(recoveredDiskFiles);
@ -3194,8 +3276,14 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
loop {
try {
ServerCoordinators coordinators(connRecord, configDBType);
wait(clusterController(
coordinators, currentCC, hasConnected, asyncPriorityInfo, locality, configDBType, recoveredDiskFiles));
wait(clusterController(coordinators,
currentCC,
hasConnected,
asyncPriorityInfo,
locality,
configDBType,
recoveredDiskFiles,
clusterId));
hasConnected = true;
} catch (Error& e) {
if (e.code() != error_code_coordinators_changed)
@ -3206,21 +3294,22 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
namespace {
// Tests `ClusterControllerData::updateWorkerHealth()` can update `ClusterControllerData::workerHealth` based on
// `UpdateWorkerHealth` request correctly.
// Tests `ClusterControllerData::updateWorkerHealth()` can update `ClusterControllerData::workerHealth`
// based on `UpdateWorkerHealth` request correctly.
TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
// Create a testing ClusterControllerData. Most of the internal states do not matter in this test.
state ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
state NetworkAddress workerAddress(IPAddress(0x01010101), 1);
state NetworkAddress badPeer1(IPAddress(0x02020202), 1);
state NetworkAddress badPeer2(IPAddress(0x03030303), 1);
state NetworkAddress badPeer3(IPAddress(0x04040404), 1);
// Create a `UpdateWorkerHealthRequest` with two bad peers, and they should appear in the `workerAddress`'s
// degradedPeers.
// Create a `UpdateWorkerHealthRequest` with two bad peers, and they should appear in the
// `workerAddress`'s degradedPeers.
{
UpdateWorkerHealthRequest req;
req.address = workerAddress;
@ -3281,8 +3370,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
previousRefreshTime = health.degradedPeers[badPeer3].lastRefreshTime;
}
// Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should not remove the worker from
// `workerHealth`.
// Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should not remove the worker
// from `workerHealth`.
{
wait(delay(0.001));
UpdateWorkerHealthRequest req;
@ -3308,7 +3397,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateRecoveredWorkers") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress worker1(IPAddress(0x01010101), 1);
NetworkAddress worker2(IPAddress(0x11111111), 1);
NetworkAddress badPeer1(IPAddress(0x02020202), 1);
@ -3357,15 +3447,16 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress worker(IPAddress(0x01010101), 1);
NetworkAddress badPeer1(IPAddress(0x02020202), 1);
NetworkAddress badPeer2(IPAddress(0x03030303), 1);
NetworkAddress badPeer3(IPAddress(0x04040404), 1);
NetworkAddress badPeer4(IPAddress(0x05050505), 1);
// Test that a reported degraded link should stay for some time before being considered as a degraded link by
// cluster controller.
// Test that a reported degraded link should stay for some time before being considered as a degraded
// link by cluster controller.
{
data.workerHealth[worker].degradedPeers[badPeer1] = { now(), now() };
data.workerHealth[worker].disconnectedPeers[badPeer2] = { now(), now() };
@ -3397,7 +3488,8 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
data.workerHealth.clear();
}
// Test that if both A complains about B and B complains about A, only one of the servers will be chosen as a degraded server.
// Test that if both A complains about B and B complains about A, only one of the servers will be chosen as a degraded
// server.
{
data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() };
@ -3478,8 +3570,8 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
data.workerHealth.clear();
}
// Test that if the degradation is reported both ways between A and other 4 servers, no degraded server is
// returned.
// Test that if the degradation is reported both ways between A and other 4 servers, no degraded server
// is returned.
{
ASSERT(SERVER_KNOBS->CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE < 4);
data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
@ -3510,7 +3602,8 @@ TEST_CASE("/fdbserver/clustercontroller/recentRecoveryCountDueToHealth") {
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
ASSERT_EQ(data.recentRecoveryCountDueToHealth(), 0);
@ -3531,7 +3624,8 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerRecoveryDueToDegradedServer
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress master(IPAddress(0x01010101), 1);
NetworkAddress tlog(IPAddress(0x02020202), 1);
NetworkAddress satelliteTlog(IPAddress(0x03030303), 1);
@ -3667,7 +3761,8 @@ TEST_CASE("/fdbserver/clustercontroller/shouldTriggerFailoverDueToDegradedServer
ClusterControllerData data(ClusterControllerFullInterface(),
LocalityData(),
ServerCoordinators(Reference<IClusterConnectionRecord>(
new ClusterConnectionMemoryRecord(ClusterConnectionString()))));
new ClusterConnectionMemoryRecord(ClusterConnectionString()))),
makeReference<AsyncVar<Optional<UID>>>());
NetworkAddress master(IPAddress(0x01010101), 1);
NetworkAddress tlog(IPAddress(0x02020202), 1);
NetworkAddress satelliteTlog(IPAddress(0x03030303), 1);

View File

@ -18,12 +18,14 @@
* limitations under the License.
*/
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Metacluster.h"
#include "fdbrpc/sim_validation.h"
#include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/BackupProgress.actor.h"
#include "fdbserver/ClusterRecovery.actor.h"
#include "fdbserver/EncryptionOpsUtils.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/MasterInterface.h"
#include "fdbserver/WaitFailure.h"
@ -297,7 +299,6 @@ ACTOR Future<Void> newTLogServers(Reference<ClusterRecoveryData> self,
self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
fRemoteWorkers,
self->clusterId,
self->configuration,
self->cstate.myDBState.recoveryCount + 1,
self->recoveryTransactionVersion,
@ -311,7 +312,6 @@ ACTOR Future<Void> newTLogServers(Reference<ClusterRecoveryData> self,
self->logSystem = Reference<ILogSystem>(); // Cancels the actors in the previous log system.
Reference<ILogSystem> newLogSystem = wait(oldLogSystem->newEpoch(recr,
Never(),
self->clusterId,
self->configuration,
self->cstate.myDBState.recoveryCount + 1,
self->recoveryTransactionVersion,
@ -347,7 +347,6 @@ ACTOR Future<Void> newSeedServers(Reference<ClusterRecoveryData> self,
isr.storeType = self->configuration.storageServerStoreType;
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = deterministicRandom()->randomUniqueID();
isr.clusterId = self->clusterId;
isr.initialClusterVersion = self->recoveryTransactionVersion;
ErrorOr<InitializeStorageReply> newServer = wait(recruits.storageServers[idx].storage.tryGetReply(isr));
@ -432,18 +431,34 @@ ACTOR Future<Void> rejoinRequestHandler(Reference<ClusterRecoveryData> self) {
}
}
namespace {
EncryptionAtRestMode getEncryptionAtRest() {
// TODO: Use db-config encryption config to determine cluster encryption status
if (SERVER_KNOBS->ENABLE_ENCRYPTION) {
return EncryptionAtRestMode(EncryptionAtRestMode::Mode::AES_256_CTR);
} else {
return EncryptionAtRestMode();
}
}
} // namespace
// Keeps the coordinated state (cstate) updated as the set of recruited tlogs change through recovery.
ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems,
Future<Void> minRecoveryDuration) {
state Future<Void> rejoinRequests = Never();
state DBRecoveryCount recoverCount = self->cstate.myDBState.recoveryCount + 1;
state EncryptionAtRestMode encryptionAtRestMode = getEncryptionAtRest();
state DatabaseConfiguration configuration =
self->configuration; // self-configuration can be changed by configurationMonitor so we need a copy
loop {
state DBCoreState newState;
self->logSystem->toCoreState(newState);
newState.recoveryCount = recoverCount;
// Update Coordinators EncryptionAtRest status during the very first recovery of the cluster (empty database)
newState.encryptionAtRestMode = encryptionAtRestMode;
state Future<Void> changed = self->logSystem->onCoreStateChanged();
ASSERT(newState.tLogs[0].tLogWriteAntiQuorum == configuration.tLogWriteAntiQuorum &&
@ -457,6 +472,7 @@ ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
.detail("FinalUpdate", finalUpdate)
.detail("NewState.tlogs", newState.tLogs.size())
.detail("NewState.OldTLogs", newState.oldTLogData.size())
.detail("NewState.EncryptionAtRestMode", newState.encryptionAtRestMode.toString())
.detail("Expected.tlogs",
configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional<Key>()));
wait(self->cstate.write(newState, finalUpdate));
@ -477,7 +493,6 @@ ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
self->dbgid)
.detail("StatusCode", RecoveryStatus::fully_recovered)
.detail("Status", RecoveryStatus::names[RecoveryStatus::fully_recovered])
.detail("ClusterId", self->clusterId)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_GENERATION_EVENT_NAME).c_str(),
@ -786,7 +801,6 @@ Future<Void> sendMasterRegistration(ClusterRecoveryData* self,
masterReq.priorCommittedLogServers = priorCommittedLogServers;
masterReq.recoveryState = self->recoveryState;
masterReq.recoveryStalled = self->recruitmentStalled->get();
masterReq.clusterId = self->clusterId;
return brokenPromiseToNever(self->clusterController.registerMaster.getReply(masterReq));
}
@ -939,7 +953,7 @@ ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
.detail("Status", RecoveryStatus::names[status])
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
return Never();
} else
} else {
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
self->dbgid)
.detail("StatusCode", RecoveryStatus::recruiting_transaction_servers)
@ -950,6 +964,12 @@ ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
.detail("RequiredResolvers", 1)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
// The cluster's EncryptionAtRest status is now readable.
if (self->controllerData->encryptionAtRestMode.canBeSet()) {
self->controllerData->encryptionAtRestMode.send(getEncryptionAtRest());
}
}
// FIXME: we only need log routers for the same locality as the master
int maxLogRouters = self->cstate.prevDBState.logRouterTags;
for (auto& old : self->cstate.prevDBState.oldTLogData) {
@ -1350,8 +1370,7 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
Reference<ILogSystem> oldLogSystem,
std::vector<StorageServerInterface>* seedServers,
std::vector<Standalone<CommitTransactionRef>>* initialConfChanges,
Future<Version> poppedTxsVersion,
bool* clusterIdExists) {
Future<Version> poppedTxsVersion) {
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), self->dbgid)
.detail("StatusCode", RecoveryStatus::reading_transaction_system_state)
.detail("Status", RecoveryStatus::names[RecoveryStatus::reading_transaction_system_state])
@ -1375,16 +1394,6 @@ ACTOR Future<Void> recoverFrom(Reference<ClusterRecoveryData> self,
debug_checkMaxRestoredVersion(UID(), self->lastEpochEnd, "DBRecovery");
// Generate a cluster ID to uniquely identify the cluster if it doesn't
// already exist in the txnStateStore.
Optional<Value> clusterId = self->txnStateStore->readValue(clusterIdKey).get();
*clusterIdExists = clusterId.present();
if (!clusterId.present()) {
self->clusterId = deterministicRandom()->randomUniqueID();
} else {
self->clusterId = BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
}
// Ordinarily we pass through this loop once and recover. We go around the loop if recovery stalls for more than a
// second, a provisional master is initialized, and an "emergency transaction" is submitted that might change the
// configuration so that we can finish recovery.
@ -1459,6 +1468,12 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
wait(self->cstate.read());
// Unless the cluster database is 'empty', the cluster's EncryptionAtRest status is readable once cstate is
// recovered
if (!self->cstate.myDBState.tLogs.empty() && self->controllerData->encryptionAtRestMode.canBeSet()) {
self->controllerData->encryptionAtRestMode.send(self->cstate.myDBState.encryptionAtRestMode);
}
if (self->cstate.prevDBState.lowestCompatibleProtocolVersion > currentProtocolVersion()) {
TraceEvent(SevWarnAlways, "IncompatibleProtocolVersion", self->dbgid).log();
throw internal_error();
@ -1540,7 +1555,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
state Future<Void> logChanges;
state Future<Void> minRecoveryDuration;
state Future<Version> poppedTxsVersion;
state bool clusterIdExists = false;
loop {
Reference<ILogSystem> oldLogSystem = oldLogSystems->get();
@ -1556,13 +1570,9 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
self->registrationTrigger.trigger();
choose {
when(wait(oldLogSystem ? recoverFrom(self,
oldLogSystem,
&seedServers,
&initialConfChanges,
poppedTxsVersion,
std::addressof(clusterIdExists))
: Never())) {
when(wait(oldLogSystem
? recoverFrom(self, oldLogSystem, &seedServers, &initialConfChanges, poppedTxsVersion)
: Never())) {
reg.cancel();
break;
}
@ -1591,7 +1601,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
.detail("Status", RecoveryStatus::names[RecoveryStatus::recovery_transaction])
.detail("PrimaryLocality", self->primaryLocality)
.detail("DcId", self->masterInterface.locality.dcId())
.detail("ClusterId", self->clusterId)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
// Recovery transaction
@ -1680,11 +1689,6 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
}
}
// Write cluster ID into txnStateStore if it is missing.
if (!clusterIdExists) {
tr.set(recoveryCommitRequest.arena, clusterIdKey, BinaryWriter::toValue(self->clusterId, Unversioned()));
}
applyMetadataMutations(SpanContext(),
self->dbgid,
recoveryCommitRequest.arena,

View File

@ -28,7 +28,7 @@
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/NativeAPI.actor.h"
@ -892,7 +892,7 @@ Optional<TenantName> getTenantName(ProxyCommitData* commitData, int64_t tenantId
if (tenantId != TenantInfo::INVALID_TENANT) {
auto itr = commitData->tenantIdIndex.find(tenantId);
if (itr != commitData->tenantIdIndex.end()) {
return Optional<TenantName>(itr->second);
return Optional<TenantName>(itr->second.get());
}
}
@ -1266,8 +1266,14 @@ ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
if (self->pProxyCommitData->isEncryptionEnabled) {
state EncryptCipherDomainId domainId = tenantId;
state MutationRef encryptedMutation;
CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::DISABLED,
"using disabled tenant mode");
CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::OPTIONAL_TENANT,
"using optional tenant mode");
CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::REQUIRED,
"using required tenant mode");
if (encryptedMutationOpt->present()) {
if (encryptedMutationOpt && encryptedMutationOpt->present()) {
CODE_PROBE(true, "using already encrypted mutation");
encryptedMutation = encryptedMutationOpt->get();
ASSERT(encryptedMutation.isEncrypted());
@ -1299,6 +1305,8 @@ ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID);
encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG);
}
ASSERT(encryptedMutation.isEncrypted());
CODE_PROBE(true, "encrypting non-metadata mutations");
self->toCommit.writeTypedMessage(encryptedMutation);
return encryptedMutation;
} else {
@ -1473,12 +1481,12 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
if (!hasCandidateBackupKeys) {
continue;
}
if (m.type != MutationRef::Type::ClearRange) {
// Add the mutation to the relevant backup tag
for (auto backupName : pProxyCommitData->vecBackupKeys[m.param1]) {
// If encryption is enabled make sure the mutation we are writing is also encrypted
ASSERT(!self->pProxyCommitData->isEncryptionEnabled || writtenMutation.isEncrypted());
CODE_PROBE(writtenMutation.isEncrypted(), "using encrypted backup mutation");
self->logRangeMutations[backupName].push_back_deep(self->logRangeMutationsArena, writtenMutation);
}
} else {
@ -1500,6 +1508,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
// TODO (Nim): Currently clear ranges are encrypted using the default encryption key, this must be
// changed to account for clear ranges which span tenant boundaries
if (self->pProxyCommitData->isEncryptionEnabled) {
CODE_PROBE(true, "encrypting clear range backup mutation");
if (backupMutation.param1 == m.param1 && backupMutation.param2 == m.param2 &&
encryptedMutation.present()) {
backupMutation = encryptedMutation.get();
@ -1510,6 +1519,7 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
backupMutation =
backupMutation.encrypt(self->cipherKeys, domainId, arena, BlobCipherMetrics::BACKUP);
}
ASSERT(backupMutation.isEncrypted());
}
// Add the mutation to the relevant backup tag
@ -1613,8 +1623,27 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
idempotencyIdSet.param2 = kv.value;
auto& tags = pProxyCommitData->tagsForKey(kv.key);
self->toCommit.addTags(tags);
self->toCommit.writeTypedMessage(idempotencyIdSet);
if (self->pProxyCommitData->isEncryptionEnabled) {
CODE_PROBE(true, "encrypting idempotency mutation");
std::pair<EncryptCipherDomainName, EncryptCipherDomainId> p =
getEncryptDetailsFromMutationRef(self->pProxyCommitData, idempotencyIdSet);
MutationRef encryptedMutation = idempotencyIdSet.encrypt(
self->cipherKeys, p.second, self->arena, BlobCipherMetrics::TLOG);
self->toCommit.writeTypedMessage(encryptedMutation);
} else {
self->toCommit.writeTypedMessage(idempotencyIdSet);
}
});
state int i = 0;
for (i = 0; i < pProxyCommitData->idempotencyClears.size(); i++) {
MutationRef& m = pProxyCommitData->idempotencyClears[i];
auto& tags = pProxyCommitData->tagsForKey(m.param1);
self->toCommit.addTags(tags);
// We already have an arena with an appropriate lifetime handy
Arena& arena = pProxyCommitData->idempotencyClears.arena();
wait(success(writeMutation(self, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, &m, nullptr, &arena)));
}
pProxyCommitData->idempotencyClears = Standalone<VectorRef<MutationRef>>();
self->toCommit.saveTags(self->writtenTags);
@ -1864,10 +1893,14 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
// Reset all to zero, used to track the correct index of each commitTransactionRef on each resolver
std::fill(self->nextTr.begin(), self->nextTr.end(), 0);
std::unordered_map<uint8_t, int16_t> idCountsForKey;
for (int t = 0; t < self->trs.size(); t++) {
auto& tr = self->trs[t];
if (self->committed[t] == ConflictBatch::TransactionCommitted && (!self->locked || tr.isLockAware())) {
ASSERT_WE_THINK(self->commitVersion != invalidVersion);
if (self->trs[t].idempotencyId.valid()) {
idCountsForKey[uint8_t(t >> 8)] += 1;
}
tr.reply.send(CommitID(self->commitVersion, t, self->metadataVersionAfter));
} else if (self->committed[t] == ConflictBatch::TransactionTooOld) {
tr.reply.sendError(transaction_too_old());
@ -1910,10 +1943,15 @@ ACTOR Future<Void> reply(CommitBatchContext* self) {
bool filter = self->maxTransactionBytes >
pProxyCommitData->latencyBandConfig.get().commitConfig.maxCommitBytes.orDefault(
std::numeric_limits<int>::max());
pProxyCommitData->stats.commitLatencyBands.addMeasurement(duration, filter);
pProxyCommitData->stats.commitLatencyBands.addMeasurement(duration, 1, Filtered(filter));
}
}
for (auto [highOrderBatchIndex, count] : idCountsForKey) {
pProxyCommitData->expectedIdempotencyIdCountForKey.send(
ExpectedIdempotencyIdCountForKey{ self->commitVersion, count, highOrderBatchIndex });
}
++pProxyCommitData->stats.commitBatchOut;
pProxyCommitData->stats.txnCommitOut += self->trs.size();
pProxyCommitData->stats.txnConflicts += self->trs.size() - self->commitCount;
@ -2507,6 +2545,96 @@ ACTOR Future<Void> monitorTenantsOverStorageQuota(UID myID,
}
}
namespace {
struct ExpireServerEntry {
int64_t timeReceived;
int expectedCount = 0;
int receivedCount = 0;
bool initialized = false;
};
struct IdempotencyKey {
Version version;
uint8_t highOrderBatchIndex;
bool operator==(const IdempotencyKey& other) const {
return version == other.version && highOrderBatchIndex == other.highOrderBatchIndex;
}
};
} // namespace
namespace std {
template <>
struct hash<IdempotencyKey> {
std::size_t operator()(const IdempotencyKey& key) const {
std::size_t seed = 0;
boost::hash_combine(seed, std::hash<Version>{}(key.version));
boost::hash_combine(seed, std::hash<uint8_t>{}(key.highOrderBatchIndex));
return seed;
}
};
} // namespace std
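
The hash specialization above folds the commit version and the high byte of the batch index into one seed with boost::hash_combine so IdempotencyKey can be used in an unordered_map. A self-contained sketch of the same pattern that swaps in a hand-rolled combine step instead of Boost (the mixing constant is a generic hash-combine magic number, not taken from this codebase):

#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>

struct IdempotencyKeyLike {
    int64_t version;
    uint8_t highOrderBatchIndex;
    bool operator==(const IdempotencyKeyLike& o) const {
        return version == o.version && highOrderBatchIndex == o.highOrderBatchIndex;
    }
};

// Same shape as boost::hash_combine: fold each member's hash into a seed.
static void hashCombine(std::size_t& seed, std::size_t value) {
    seed ^= value + 0x9e3779b97f4a7c15ull + (seed << 6) + (seed >> 2);
}

namespace std {
template <>
struct hash<IdempotencyKeyLike> {
    std::size_t operator()(const IdempotencyKeyLike& key) const {
        std::size_t seed = 0;
        hashCombine(seed, std::hash<int64_t>{}(key.version));
        hashCombine(seed, std::hash<uint8_t>{}(key.highOrderBatchIndex));
        return seed;
    }
};
} // namespace std

int main() {
    std::unordered_map<IdempotencyKeyLike, int> received;
    received[{ 100, 0 }] += 1;
    received[{ 100, 0 }] += 1;
    received[{ 100, 1 }] += 1;
    std::cout << received[{ 100, 0 }] << " " << received[{ 100, 1 }] << "\n"; // 2 1
}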
ACTOR static Future<Void> idempotencyIdsExpireServer(
Database db,
PublicRequestStream<ExpireIdempotencyIdRequest> expireIdempotencyId,
PromiseStream<ExpectedIdempotencyIdCountForKey> expectedIdempotencyIdCountForKey,
Standalone<VectorRef<MutationRef>>* idempotencyClears) {
state std::unordered_map<IdempotencyKey, ExpireServerEntry> idStatus;
state std::unordered_map<IdempotencyKey, ExpireServerEntry>::iterator iter;
state int64_t purgeBefore;
state IdempotencyKey key;
state ExpireServerEntry* status = nullptr;
state Future<Void> purgeOld = Void();
loop {
choose {
when(ExpireIdempotencyIdRequest req = waitNext(expireIdempotencyId.getFuture())) {
key = IdempotencyKey{ req.commitVersion, req.batchIndexHighByte };
status = &idStatus[key];
status->receivedCount += 1;
CODE_PROBE(status->expectedCount == 0, "ExpireIdempotencyIdRequest received before count is known");
if (status->expectedCount > 0) {
ASSERT_LE(status->receivedCount, status->expectedCount);
}
}
when(ExpectedIdempotencyIdCountForKey req = waitNext(expectedIdempotencyIdCountForKey.getFuture())) {
key = IdempotencyKey{ req.commitVersion, req.batchIndexHighByte };
status = &idStatus[key];
ASSERT_EQ(status->expectedCount, 0);
status->expectedCount = req.idempotencyIdCount;
}
when(wait(purgeOld)) {
purgeOld = delay(SERVER_KNOBS->IDEMPOTENCY_ID_IN_MEMORY_LIFETIME);
purgeBefore = now() - SERVER_KNOBS->IDEMPOTENCY_ID_IN_MEMORY_LIFETIME;
for (iter = idStatus.begin(); iter != idStatus.end();) {
// We have exclusive access to idStatus in this when block, so iter will still be valid after the
// wait
wait(yield());
if (iter->second.timeReceived < purgeBefore) {
iter = idStatus.erase(iter);
} else {
++iter;
}
}
continue;
}
}
if (status->initialized) {
if (status->receivedCount == status->expectedCount) {
auto keyRange =
makeIdempotencySingleKeyRange(idempotencyClears->arena(), key.version, key.highOrderBatchIndex);
idempotencyClears->push_back(idempotencyClears->arena(),
MutationRef(MutationRef::ClearRange, keyRange.begin, keyRange.end));
idStatus.erase(key);
}
} else {
status->timeReceived = now();
status->initialized = true;
}
}
}
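
idempotencyIdsExpireServer above counts, per (commitVersion, batchIndexHighByte), how many expiration requests have arrived versus how many the commit path said to expect, and only queues a clear of the idempotency keys once the counts match. A compact sketch of just that bookkeeping (the purge timer, the actual clear mutation, and the actor plumbing are omitted; names are illustrative):

#include <cstdint>
#include <iostream>
#include <map>
#include <utility>

struct ExpireEntry {
    int expected = 0; // set once by the commit path
    int received = 0; // bumped by each expiration request
};

using Key = std::pair<int64_t /*version*/, uint8_t /*batch index high byte*/>;

// Returns true when every expected expiration for this key has arrived,
// i.e. when the caller may clear the corresponding idempotency key range.
static bool recordExpire(std::map<Key, ExpireEntry>& status, Key key, bool isExpectedCount, int count) {
    ExpireEntry& e = status[key];
    if (isExpectedCount)
        e.expected = count;
    else
        e.received += count;
    if (e.expected > 0 && e.received == e.expected) {
        status.erase(key);
        return true;
    }
    return false;
}

int main() {
    std::map<Key, ExpireEntry> status;
    Key k{ 7000000, 0 };
    recordExpire(status, k, /*isExpectedCount=*/true, 2); // commit path: expect 2
    recordExpire(status, k, false, 1);                    // first expiration request
    bool clearNow = recordExpire(status, k, false, 1);    // second -> ready to clear
    std::cout << "clear idempotency ids: " << clearNow << "\n";
}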
namespace {
struct TransactionStateResolveContext {
@ -2771,6 +2899,10 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
addActor.send(rejoinServer(proxy, &commitData));
addActor.send(ddMetricsRequestServer(proxy, db));
addActor.send(reportTxnTagCommitCost(proxy.id(), db, &commitData.ssTrTagCommitCost));
addActor.send(idempotencyIdsExpireServer(openDBOnServer(db),
proxy.expireIdempotencyId,
commitData.expectedIdempotencyIdCountForKey,
&commitData.idempotencyClears));
addActor.send(monitorTenantsOverStorageQuota(proxy.id(), db, &commitData));
// wait for txnStateStore recovery

View File

@ -382,7 +382,6 @@ ACTOR Future<bool> checkDataConsistency(Database cx,
// Note: this may cause some shards to be processed more than once or not at all in a non-quiescent database
state int effectiveClientCount = distributed ? clientCount : 1;
state int i = clientId * (shardSampleFactor + 1);
state int increment = (distributed && !firstClient) ? effectiveClientCount * shardSampleFactor : 1;
state int64_t rateLimitForThisRound =
*bytesReadInPrevRound == 0
? maxRate

View File

@ -272,9 +272,6 @@ ACTOR Future<Void> trackShardMetrics(DataDistributionTracker::SafeAccessor self,
state double lastLowBandwidthStartTime =
shardMetrics->get().present() ? shardMetrics->get().get().lastLowBandwidthStartTime : now();
state int shardCount = shardMetrics->get().present() ? shardMetrics->get().get().shardCount : 1;
state ReadBandwidthStatus readBandwidthStatus = shardMetrics->get().present()
? getReadBandwidthStatus(shardMetrics->get().get().metrics)
: ReadBandwidthStatusNormal;
state bool initWithNewMetrics = whenDDInit;
wait(delay(0, TaskPriority::DataDistribution));

View File

@ -1518,8 +1518,6 @@ public:
ServerStatus* status,
Version addedVersion) {
state StorageServerInterface interf = server->getLastKnownInterface();
state int targetTeamNumPerServer =
(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2;
loop {
state bool inHealthyZone = false; // healthChanged actor will be Never() if this flag is true
if (self->healthyZone.get().present()) {
@ -2284,15 +2282,12 @@ public:
self->recruitingIds.insert(interfaceId);
self->recruitingLocalities.insert(candidateWorker.worker.stableAddress());
UID clusterId = wait(self->getClusterId());
state InitializeStorageRequest isr;
isr.storeType = recruitTss ? self->configuration.testingStorageServerStoreType
: self->configuration.storageServerStoreType;
isr.seedTag = invalidTag;
isr.reqId = deterministicRandom()->randomUniqueID();
isr.interfaceId = interfaceId;
isr.clusterId = clusterId;
// if tss, wait for pair ss to finish and add its id to isr. If pair fails, don't recruit tss
state bool doRecruit = true;
@ -3470,10 +3465,6 @@ Future<Void> DDTeamCollection::monitorHealthyTeams() {
return DDTeamCollectionImpl::monitorHealthyTeams(this);
}
Future<UID> DDTeamCollection::getClusterId() {
return db->getClusterId();
}
Future<UID> DDTeamCollection::getNextWigglingServerID() {
Optional<Value> localityKey;
Optional<Value> localityValue;

View File

@ -221,21 +221,6 @@ class DDTxnProcessorImpl {
}
}
ACTOR static Future<UID> getClusterId(Database cx) {
state Transaction tr(cx);
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Optional<Value> clusterId = wait(tr.get(clusterIdKey));
ASSERT(clusterId.present());
return BinaryReader::fromStringRef<UID>(clusterId.get(), Unversioned());
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Read keyservers, return unique set of teams
ACTOR static Future<Reference<InitialDataDistribution>> getInitialDataDistribution(
Database cx,
@ -319,6 +304,7 @@ class DDTxnProcessorImpl {
for (int i = 0; i < dms.size(); ++i) {
auto dataMove = std::make_shared<DataMove>(decodeDataMoveValue(dms[i].value), true);
const DataMoveMetaData& meta = dataMove->meta;
ASSERT(!meta.ranges.empty());
for (const UID& id : meta.src) {
auto& dc = server_dc[id];
if (std::find(remoteDcIds.begin(), remoteDcIds.end(), dc) != remoteDcIds.end()) {
@ -340,11 +326,11 @@ class DDTxnProcessorImpl {
std::sort(dataMove->primaryDest.begin(), dataMove->primaryDest.end());
std::sort(dataMove->remoteDest.begin(), dataMove->remoteDest.end());
auto ranges = result->dataMoveMap.intersectingRanges(meta.range);
auto ranges = result->dataMoveMap.intersectingRanges(meta.ranges.front());
for (auto& r : ranges) {
ASSERT(!r.value()->valid);
}
result->dataMoveMap.insert(meta.range, std::move(dataMove));
result->dataMoveMap.insert(meta.ranges.front(), std::move(dataMove));
++numDataMoves;
}
@ -675,10 +661,6 @@ Future<int> DDTxnProcessor::tryUpdateReplicasKeyForDc(const Optional<Key>& dcId,
return DDTxnProcessorImpl::tryUpdateReplicasKeyForDc(cx, dcId, storageTeamSize);
}
Future<UID> DDTxnProcessor::getClusterId() const {
return DDTxnProcessorImpl::getClusterId(cx);
}
Future<Void> DDTxnProcessor::waitDDTeamInfoPrintSignal() const {
return DDTxnProcessorImpl::waitDDTeamInfoPrintSignal(cx);
}

View File

@ -90,7 +90,7 @@ void DataMove::validateShard(const DDShardInfo& shard, KeyRangeRef range, int pr
return;
}
ASSERT(this->meta.range.contains(range));
ASSERT(!this->meta.ranges.empty() && this->meta.ranges.front().contains(range));
if (!shard.hasDest) {
TraceEvent(SevError, "DataMoveValidationError")
@ -496,17 +496,21 @@ public:
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
const DataMoveMetaData& meta = it.value()->meta;
if (meta.ranges.empty()) {
TraceEvent(SevWarnAlways, "EmptyDataMoveRange", self->ddId).detail("DataMoveMetaData", meta.toString());
continue;
}
if (it.value()->isCancelled() || (it.value()->valid && !SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.ranges.front(), DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.cancelled = true;
self->relocationProducer.send(rs);
TraceEvent("DDInitScheduledCancelDataMove", self->ddId).detail("DataMove", meta.toString());
} else if (it.value()->valid) {
TraceEvent(SevDebug, "DDInitFoundDataMove", self->ddId).detail("DataMove", meta.toString());
ASSERT(meta.range == it.range());
ASSERT(meta.ranges.front() == it.range());
// TODO: Persist priority in DataMoveMetaData.
RelocateShard rs(meta.range, DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
RelocateShard rs(meta.ranges.front(), DataMovementReason::RECOVER_MOVE, RelocateReason::OTHER);
rs.dataMoveId = meta.id;
rs.dataMove = it.value();
std::vector<ShardsAffectedByTeamFailure::Team> teams;

View File

@ -55,7 +55,7 @@ struct StringBuffer {
StringBuffer(UID fromFileID) : reserved(0), id(fromFileID) {}
int size() const { return str.size(); }
StringRef& ref() { return str; }
Standalone<StringRef> get() { return str; }
void clear() {
str = Standalone<StringRef>();
reserved = 0;
@ -63,19 +63,19 @@ struct StringBuffer {
void clearReserve(int size) {
str = Standalone<StringRef>();
reserved = size;
ref() = StringRef(new (str.arena()) uint8_t[size], 0);
str.contents() = StringRef(new (str.arena()) uint8_t[size], 0);
}
void append(StringRef x) { memcpy(append(x.size()), x.begin(), x.size()); }
void* append(int bytes) {
ASSERT(str.size() + bytes <= reserved);
void* p = const_cast<uint8_t*>(str.end());
ref() = StringRef(str.begin(), str.size() + bytes);
str.contents() = StringRef(str.begin(), str.size() + bytes);
return p;
}
StringRef pop_front(int bytes) {
ASSERT(bytes <= str.size());
StringRef result = str.substr(0, bytes);
ref() = str.substr(bytes);
str.contents() = str.substr(bytes);
return result;
}
void alignReserve(int alignment, int size) {
@ -101,7 +101,7 @@ struct StringBuffer {
if (str.size() > 0) {
memcpy(p, str.begin(), str.size());
}
ref() = StringRef(p, str.size());
str.contents() = StringRef(p, str.size());
}
}
};
@ -196,7 +196,7 @@ public:
stallCount.init("RawDiskQueue.StallCount"_sr);
}
Future<Void> pushAndCommit(StringRef pageData, StringBuffer* pageMem, uint64_t poppedPages) {
Future<Void> pushAndCommit(Standalone<StringRef> pageData, StringBuffer* pageMem, uint64_t poppedPages) {
return pushAndCommit(this, pageData, pageMem, poppedPages);
}
@ -332,13 +332,13 @@ public:
}
#endif
Future<Future<Void>> push(StringRef pageData, std::vector<Reference<SyncQueue>>* toSync) {
Future<Future<Void>> push(Standalone<StringRef> pageData, std::vector<Reference<SyncQueue>>* toSync) {
return push(this, pageData, toSync);
}
ACTOR static Future<Future<Void>> push(RawDiskQueue_TwoFiles* self,
StringRef pageData,
std::vector<Reference<SyncQueue>>* toSync) {
ACTOR static UNCANCELLABLE Future<Future<Void>> push(RawDiskQueue_TwoFiles* self,
Standalone<StringRef> pageData,
std::vector<Reference<SyncQueue>>* toSync) {
// Write the given data (pageData) to the queue files, swapping or extending them if necessary.
// Don't do any syncs, but push the modified file(s) onto toSync.
ASSERT(self->readingFile == 2);
@ -357,8 +357,9 @@ public:
toSync->push_back(self->files[1].syncQueue);
/*TraceEvent("RDQWriteAndSwap", this->dbgid).detail("File1name", self->files[1].dbgFilename).detail("File1size", self->files[1].size)
.detail("WritingPos", self->writingPos).detail("WritingBytes", p);*/
waitfor.push_back(self->files[1].f->write(pageData.begin(), p, self->writingPos));
pageData = pageData.substr(p);
waitfor.push_back(uncancellable(
holdWhile(pageData, self->files[1].f->write(pageData.begin(), p, self->writingPos))));
pageData.contents() = pageData.substr(p);
}
self->dbg_file0BeginSeq += self->files[0].size;
@ -426,7 +427,8 @@ public:
.detail("WritingPos", self->writingPos).detail("WritingBytes", pageData.size());*/
self->files[1].size = std::max(self->files[1].size, self->writingPos + pageData.size());
toSync->push_back(self->files[1].syncQueue);
waitfor.push_back(self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos));
waitfor.push_back(uncancellable(
holdWhile(pageData, self->files[1].f->write(pageData.begin(), pageData.size(), self->writingPos))));
self->writingPos += pageData.size();
return waitForAllReadyThenThrow(waitfor);
@ -435,7 +437,7 @@ public:
// Write the given data (pageData) to the queue files of self, sync data to disk, and delete the memory (pageMem)
// that hold the pageData
ACTOR static UNCANCELLABLE Future<Void> pushAndCommit(RawDiskQueue_TwoFiles* self,
StringRef pageData,
Standalone<StringRef> pageData,
StringBuffer* pageMem,
uint64_t poppedPages) {
state Promise<Void> pushing, committed;
@ -983,7 +985,7 @@ public:
lastCommittedSeq = backPage().endSeq();
auto f = rawQueue->pushAndCommit(
pushed_page_buffer->ref(), pushed_page_buffer, poppedSeq / sizeof(Page) - lastPoppedSeq / sizeof(Page));
pushed_page_buffer->get(), pushed_page_buffer, poppedSeq / sizeof(Page) - lastPoppedSeq / sizeof(Page));
lastPoppedSeq = poppedSeq;
pushed_page_buffer = 0;
return f;
@ -1064,7 +1066,7 @@ private:
};
uint64_t seq; // seq is the index of the virtually infinite disk queue file. Its unit is bytes.
uint64_t popped;
int payloadSize;
int32_t payloadSize;
};
// The on disk format depends on the size of PageHeader.
static_assert(sizeof(PageHeader) == 36, "PageHeader must be 36 bytes");
@ -1179,7 +1181,7 @@ private:
Standalone<StringRef> pagedData = wait(readPages(self, start, end));
const int startOffset = start % _PAGE_SIZE;
const int dataLen = end - start;
ASSERT(pagedData.substr(startOffset, dataLen).compare(buffer->ref().substr(0, dataLen)) == 0);
ASSERT(pagedData.substr(startOffset, dataLen).compare(buffer->get().substr(0, dataLen)) == 0);
} catch (Error& e) {
if (e.code() != error_code_io_error) {
delete buffer;
@ -1546,9 +1548,9 @@ private:
StringBuffer* pushed_page_buffer;
Page& backPage() {
ASSERT(pushedPageCount());
return ((Page*)pushed_page_buffer->ref().end())[-1];
return ((Page*)pushed_page_buffer->get().end())[-1];
}
Page const& backPage() const { return ((Page*)pushed_page_buffer->ref().end())[-1]; }
Page const& backPage() const { return ((Page*)pushed_page_buffer->get().end())[-1]; }
int pushedPageCount() const { return pushed_page_buffer ? pushed_page_buffer->size() / sizeof(Page) : 0; }
// Recovery state
@ -1662,3 +1664,43 @@ IDiskQueue* openDiskQueue(std::string basename,
int64_t fileSizeWarningLimit) {
return new DiskQueue_PopUncommitted(basename, ext, dbgid, dqv, fileSizeWarningLimit);
}
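
The DiskQueue changes above move the push path from raw StringRef to Standalone<StringRef> and wrap the file writes in uncancellable(holdWhile(...)), so the page buffer and the in-flight write both survive cancellation of the pushing actor. A rough analogy in plain C++, where a shared_ptr captured by the waiting task plays the role of the arena-holding Standalone (none of these names are FDB APIs):

#include <chrono>
#include <cstddef>
#include <future>
#include <iostream>
#include <memory>
#include <string>
#include <thread>

// Pretend asynchronous "disk write": it keeps reading from `data` after the
// caller has already returned.
static std::future<void> writeToDisk(const char* data, size_t len) {
    return std::async(std::launch::async, [data, len] {
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
        std::cout << "wrote " << len << " bytes, first byte '" << data[0] << "'\n";
    });
}

// holdWhile-style helper: keep `buffer` alive until the write completes, even
// if the caller drops its own reference to the page.
static std::future<void> pushPage(std::shared_ptr<std::string> buffer) {
    std::future<void> write = writeToDisk(buffer->data(), buffer->size());
    return std::async(std::launch::async, [buffer, w = std::move(write)]() mutable {
        w.wait(); // `buffer` is captured here, so it cannot be freed mid-write
    });
}

int main() {
    std::future<void> done = pushPage(std::make_shared<std::string>(4096, 'x'));
    // The caller no longer holds the page, but the in-flight write still does.
    done.wait();
}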
TEST_CASE("performance/fdbserver/DiskQueue") {
state IDiskQueue* queue =
openDiskQueue("test-", "fdq", deterministicRandom()->randomUniqueID(), DiskQueueVersion::V2);
state std::string valueString = std::string(10e6, '.');
state StringRef valueStr((uint8_t*)valueString.c_str(), 10e6);
state std::deque<IDiskQueue::location> locations;
state int loopCount = 0;
state Future<Void> lastCommit = Void();
bool fullyRecovered = wait(queue->initializeRecovery(0));
if (!fullyRecovered) {
loop {
Standalone<StringRef> h = wait(queue->readNext(1e6));
if (h.size() < 1e6) {
break;
}
}
}
while (loopCount < 4000) {
if (loopCount % 100 == 0) {
printf("loop count: %d\n", loopCount);
}
if (++loopCount % 2 == 0) {
state IDiskQueue::location frontLocation = locations.front();
locations.pop_front();
if (locations.size() > 10) {
Standalone<StringRef> r = wait(queue->read(frontLocation, locations.front(), CheckHashes::True));
}
queue->pop(frontLocation);
}
wait(delay(0.001));
locations.push_back(queue->push(valueStr));
Future<Void> prevCommit = lastCommit;
lastCommit = queue->commit();
wait(prevCommit);
}
queue->dispose();
wait(queue->onClosed());
return Void();
}

Some files were not shown because too many files have changed in this diff.