Merge remote-tracking branch 'origin/main' into authz-security-tests

This commit is contained in:
Junhyun Shim 2022-11-08 13:15:37 +01:00
commit 66e3050062
211 changed files with 6476 additions and 2584 deletions

View File

@ -1,5 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401, E711
ignore = E203, E266, E501, W503, F403, F401, E711, C901, W605
max-line-length = 79
max-complexity = 18
select = B,C,E,F,W,T4,B9

View File

@ -80,7 +80,7 @@ class Result:
if len(t1) != len(t2):
return False
return all([Result.elements_equal(x,y) for x,y in zip(t1, t2)])
return all([Result.elements_equal(x, y) for x, y in zip(t1, t2)])
def matches_key(self, rhs, specification):
if not isinstance(rhs, Result):

View File

@ -464,7 +464,7 @@ def parse_args(argv):
# SOMEDAY: this applies only to the scripted test. Should we invoke test files specifically (as in circus),
# or invoke them here and allow tests to add arguments?
parser.add_argument('--no-threads', action='store_true', help='Disables the START_THREAD instruction in the scripted test.')
parser.add_argument('--no-directory-snapshot-ops', action='store_true', help='Disables snapshot operations for directory instructions.')
parser.add_argument('--no-tenants', action='store_true', help='Disables tenant operations.')

View File

@ -577,7 +577,7 @@ class ApiTest(Test):
key1, key2 = key2, key1
# TODO: randomize chunkSize but should not exceed 100M(shard limit)
chunkSize = 10000000 # 10M
chunkSize = 10000000 # 10M
instructions.push_args(key1, key2, chunkSize)
instructions.append(op)
self.add_strings(1)

View File

@ -114,7 +114,7 @@ class DirectoryTest(Test):
instructions.push_args(layer)
instructions.push_args(*test_util.with_length(path))
instructions.append('DIRECTORY_OPEN')
self.dir_list.append(self.root.add_child(path, DirectoryStateTreeNode(True, True, has_known_prefix=False, is_partition=(layer==b'partition'))))
self.dir_list.append(self.root.add_child(path, DirectoryStateTreeNode(True, True, has_known_prefix=False, is_partition=(layer == b'partition'))))
# print('%d. Selected %s, dir=%s, dir_id=%s, has_known_prefix=%s, dir_list_len=%d' \
# % (len(instructions), 'DIRECTORY_OPEN', repr(self.dir_index), self.dir_list[-1].dir_id, False, len(self.dir_list)-1))
@ -163,8 +163,8 @@ class DirectoryTest(Test):
elif root_op == 'DIRECTORY_CREATE_LAYER':
indices = []
prefixes = [generate_prefix(require_unique=args.concurrency==1, is_partition=True) for i in range(2)]
prefixes = [generate_prefix(require_unique=args.concurrency == 1, is_partition=True) for i in range(2)]
for i in range(2):
instructions.push_args(prefixes[i])
instructions.push_args(*test_util.with_length(generate_path()))
@ -184,9 +184,9 @@ class DirectoryTest(Test):
test_util.blocking_commit(instructions)
path = generate_path()
# Partitions that use the high-contention allocator can result in non-determinism if they fail to commit,
# Partitions that use the high-contention allocator can result in non-determinism if they fail to commit,
# so we disallow them in comparison tests
op_args = test_util.with_length(path) + (self.generate_layer(allow_partition=args.concurrency>1),)
op_args = test_util.with_length(path) + (self.generate_layer(allow_partition=args.concurrency > 1),)
directory_util.push_instruction_and_record_prefix(instructions, op, op_args, path, len(self.dir_list), self.random, self.prefix_log)
if not op.endswith('_DATABASE') and args.concurrency == 1:
@ -196,14 +196,14 @@ class DirectoryTest(Test):
if child_entry is None:
child_entry = DirectoryStateTreeNode(True, True)
child_entry.state.has_known_prefix = False
child_entry.state.has_known_prefix = False
self.dir_list.append(dir_entry.add_child(path, child_entry))
elif root_op == 'DIRECTORY_CREATE':
layer = self.generate_layer()
is_partition = layer == b'partition'
prefix = generate_prefix(require_unique=is_partition and args.concurrency==1, is_partition=is_partition, min_length=0)
prefix = generate_prefix(require_unique=is_partition and args.concurrency == 1, is_partition=is_partition, min_length=0)
# Because allocated prefixes are non-deterministic, we cannot have overlapping
# transactions that allocate/remove these prefixes in a comparison test
@ -409,7 +409,7 @@ def generate_prefix(require_unique=False, is_partition=False, min_length=1):
if require_unique:
min_length = max(min_length, 16)
length = random.randint(min_length, min_length+5)
length = random.randint(min_length, min_length + 5)
if length == 0:
return b''
@ -419,6 +419,6 @@ def generate_prefix(require_unique=False, is_partition=False, min_length=1):
else:
return bytes([random.randrange(ord('\x02'), ord('\x14')) for i in range(0, length)])
else:
prefix = fixed_prefix
prefix = fixed_prefix
generated = prefix[0:random.randrange(min_length, len(prefix))]
return generated

View File

@ -1,5 +1,6 @@
import sys
class TreeNodeState:
def __init__(self, node, dir_id, is_directory, is_subspace, has_known_prefix, root, is_partition):
self.dir_id = dir_id
@ -9,10 +10,11 @@ class TreeNodeState:
self.root = root
self.is_partition = is_partition
self.parents = { node }
self.parents = {node}
self.children = {}
self.deleted = False
# Represents an element of the directory hierarchy. As a result of various operations (e.g. moves) that
# may or may not have succeeded, a node can represent multiple possible states.
class DirectoryStateTreeNode:
@ -25,7 +27,7 @@ class DirectoryStateTreeNode:
default_directory = None
# Used for debugging
dir_id = 0
dir_id = 0
@classmethod
def reset(cls):
@ -62,7 +64,7 @@ class DirectoryStateTreeNode:
if default is not None:
default_child = default.state.children.get(subpath[0])
self_child = self.state.children.get(subpath[0])
self_child = self.state.children.get(subpath[0])
if self_child is None:
if default_child is None:
@ -143,13 +145,15 @@ class DirectoryStateTreeNode:
child = self.get_descendent(path)
if child:
child._delete_impl()
def validate_dir(dir, root):
if dir.state.is_directory:
assert dir.state.root == root
else:
assert dir.state.root == dir
def run_test():
all_entries = []
@ -249,11 +253,11 @@ def run_test():
# Test moving an entry
assert not entry.state.has_known_prefix
assert not entry.state.is_subspace
assert list(entry.state.children.keys()) == ['1']
assert list(entry.state.children.keys()) == ['1']
for e in all_entries:
validate_dir(e, root)
if __name__ == '__main__':
sys.exit(run_test())

View File

@ -18,7 +18,6 @@
# limitations under the License.
#
import random
import struct
import fdb
@ -35,6 +34,7 @@ DEFAULT_DIRECTORY_INDEX = 4
DEFAULT_DIRECTORY_PREFIX = b'default'
DIRECTORY_ERROR_STRING = b'DIRECTORY_ERROR'
def setup_directories(instructions, default_path, random):
# Clients start with the default directory layer in the directory list
DirectoryStateTreeNode.reset()

View File

@ -107,7 +107,7 @@ class RandomGenerator(object):
user_version = random.randint(0, 0xffff)
tup.append(fdb.tuple.Versionstamp(tr_version, user_version))
else:
assert false
assert False
return tuple(tup)

View File

@ -31,6 +31,7 @@ from bindingtester.tests import test_util
fdb.api_version(FDB_API_VERSION)
class TupleTest(Test):
def __init__(self, subspace):
super(TupleTest, self).__init__(subspace)
@ -44,14 +45,14 @@ class TupleTest(Test):
def generate(self, args, thread_number):
instructions = InstructionSet()
min_value = -2**self.max_int_bits+1
max_value = 2**self.max_int_bits-1
min_value = -2**self.max_int_bits + 1
max_value = 2**self.max_int_bits - 1
instructions.append('NEW_TRANSACTION')
# Test integer encoding
mutations = 0
for i in range(0, self.max_int_bits+1):
for i in range(0, self.max_int_bits + 1):
for sign in [-1, 1]:
sign_str = '' if sign == 1 else '-'
for offset in range(-10, 11):

View File

@ -285,7 +285,7 @@ if(NOT WIN32)
--api-tester-bin $<TARGET_FILE:fdb_c_api_tester>
--external-client-library ${CMAKE_CURRENT_BINARY_DIR}/libfdb_c_external.so
--test-file ${test_file}
--knob delete-native-lib-after-loading=false
--retain-client-lib-copies
)
set_tests_properties("${test_name}" PROPERTIES TIMEOUT 300)
endforeach()
@ -442,7 +442,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer
DEPENDS ${IMPLIBSO_SRC} fdb_c
COMMENT "Generating source code for C shim library")
add_library(fdb_c_shim STATIC ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
add_library(fdb_c_shim SHARED ${SHIM_LIB_GEN_SRC} foundationdb/fdb_c_shim.h fdb_c_shim.cpp)
target_link_options(fdb_c_shim PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/fdb_c.map,-z,nodelete,-z,noexecstack")
target_link_libraries(fdb_c_shim PUBLIC dl)
target_include_directories(fdb_c_shim PUBLIC

View File

@ -21,7 +21,7 @@
#include "fdbclient/FDBTypes.h"
#include "flow/ProtocolVersion.h"
#include <cstdint>
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#define FDB_INCLUDE_LEGACY_TYPES
#include "fdbclient/MultiVersionTransaction.h"
@ -905,6 +905,10 @@ extern "C" DLLEXPORT fdb_error_t fdb_transaction_get_committed_version(FDBTransa
CATCH_AND_RETURN(*out_version = TXN(tr)->getCommittedVersion(););
}
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_total_cost(FDBTransaction* tr) {
return (FDBFuture*)TXN(tr)->getTotalCost().extractPtr();
}
extern "C" DLLEXPORT FDBFuture* fdb_transaction_get_approximate_size(FDBTransaction* tr) {
return (FDBFuture*)TXN(tr)->getApproximateSize().extractPtr();
}

View File

@ -27,10 +27,10 @@
#endif
#if !defined(FDB_API_VERSION)
#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 720)
#error You must #define FDB_API_VERSION prior to including fdb_c.h (current version is 730)
#elif FDB_API_VERSION < 13
#error API version no longer supported (upgrade to 13)
#elif FDB_API_VERSION > 720
#elif FDB_API_VERSION > 730
#error Requested API version requires a newer version of this header
#endif
@ -514,12 +514,14 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_transaction_get_committed_version(F
int64_t* out_version);
/*
* This function intentionally returns an FDBFuture instead of an integer
* directly, so that calling this API can see the effect of previous
* These functions intentionally return an FDBFuture instead of an integer
* directly, so that calling the API can see the effect of previous
* mutations on the transaction. Specifically, mutations are applied
* asynchronously by the main thread. In order to see them, this call has to
* be serviced by the main thread too.
*/
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_total_cost(FDBTransaction* tr);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_approximate_size(FDBTransaction* tr);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_get_versionstamp(FDBTransaction* tr);
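As a rough illustration of the comment above, a caller blocks on the returned future and then extracts the 64-bit value. The following is a minimal sketch (not part of this diff), assuming an existing FDBTransaction* named tr and the standard fdb_c future helpers fdb_future_block_until_ready, fdb_future_get_error, fdb_future_get_int64 and fdb_future_destroy; error handling is abbreviated:

/* Hypothetical usage sketch for the future-based size/cost getters. */
FDBFuture* f = fdb_transaction_get_approximate_size(tr);
fdb_error_t err = fdb_future_block_until_ready(f); /* serviced by the client's main thread, per the comment above */
if (!err)
    err = fdb_future_get_error(f);
int64_t approximate_size = 0;
if (!err)
    err = fdb_future_get_int64(f, &approximate_size); /* reflects mutations applied to the transaction so far */
fdb_future_destroy(f);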

View File

@ -31,8 +31,8 @@ func_re = re.compile(
"^\s*FDB_API_(?:CHANGED|REMOVED)\s*\(\s*([^,]*),\s*([^)]*)\).*")
with open(source, 'r') as srcfile:
for l in srcfile:
m = func_re.match(l)
for line in srcfile:
m = func_re.match(line)
if m:
func, ver = m.groups()
if func not in functions:
@ -59,7 +59,7 @@ def write_windows_asm(asmfile, functions):
def write_unix_asm(asmfile, functions, prefix):
if cpu != "aarch64" and cpu!= "ppc64le":
if cpu != "aarch64" and cpu != "ppc64le":
asmfile.write(".intel_syntax noprefix\n")
i = 0
@ -132,7 +132,7 @@ def write_unix_asm(asmfile, functions, prefix):
asmfile.write("\tstd 31, -8(1)\n")
asmfile.write("\tstd 0,16(1)\n")
asmfile.write("\tstdu 1,-192(1)\n")
#asmfile.write("\tstd 2,24(1)\n")
# asmfile.write("\tstd 2,24(1)\n")
asmfile.write("\taddis 11,2,.LC%d@toc@ha\n" % (i))
asmfile.write("\tld 11,.LC%d@toc@l(11)\n" % (i))
asmfile.write("\tld 12,0(11)\n")

View File

@ -20,11 +20,14 @@
#include "TesterApiWorkload.h"
#include "TesterBlobGranuleUtil.h"
#include "TesterUtil.h"
#include <unordered_set>
#include <memory>
#include <fmt/format.h>
namespace FdbApiTester {
#define BG_API_DEBUG_VERBOSE false
class ApiBlobGranuleCorrectnessWorkload : public ApiWorkload {
public:
ApiBlobGranuleCorrectnessWorkload(const WorkloadConfig& config) : ApiWorkload(config) {
@ -35,7 +38,7 @@ public:
}
private:
// FIXME: use other new blob granule apis!
// FIXME: add tenant support for DB operations
enum OpType {
OP_INSERT,
OP_CLEAR,
@ -51,7 +54,27 @@ private:
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
bool seenReadSuccess = false;
// FIXME: this needs to be per tenant if tenant ids are set
std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
std::string tenantDebugString(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format(" (tenant {0})", tenantId.value()) : "";
}
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}){3}: {4}",
opName,
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
tenantDebugString(tenantId),
message));
}
}
void randomReadOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
@ -63,8 +86,10 @@ private:
auto results = std::make_shared<std::vector<fdb::KeyValue>>();
auto tooOld = std::make_shared<bool>(false);
debugOp("Read", begin, end, tenantId, "starting");
execTransaction(
[this, begin, end, results, tooOld](auto ctx) {
[this, begin, end, tenantId, results, tooOld](auto ctx) {
ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
TesterGranuleContext testerContext(ctx->getBGBasePath());
fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext);
@ -74,8 +99,13 @@ private:
auto out = fdb::Result::KeyValueRefArray{};
fdb::Error err = res.getKeyValueArrayNothrow(out);
if (err.code() == error_code_blob_granule_transaction_too_old) {
info("BlobGranuleCorrectness::randomReadOp bg too old\n");
ASSERT(!seenReadSuccess);
bool previousSuccess = seenReadSuccess(tenantId);
if (previousSuccess) {
error("Read bg too old after read success!\n");
} else {
info("Read bg too old\n");
}
ASSERT(!previousSuccess);
*tooOld = true;
ctx->done();
} else if (err.code() != error_code_success) {
@ -85,10 +115,13 @@ private:
auto& [resVector, out_more] = resCopy;
ASSERT(!out_more);
results.get()->assign(resVector.begin(), resVector.end());
if (!seenReadSuccess) {
info("BlobGranuleCorrectness::randomReadOp first success\n");
bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) {
info(fmt::format("Read{0}: first success\n", tenantDebugString(tenantId)));
setReadSuccess(tenantId);
} else {
debugOp("Read", begin, end, tenantId, "complete");
}
seenReadSuccess = true;
ctx->done();
}
},
@ -97,7 +130,7 @@ private:
std::vector<fdb::KeyValue> expected =
stores[tenantId].getRange(begin, end, stores[tenantId].size(), false);
if (results->size() != expected.size()) {
error(fmt::format("randomReadOp result size mismatch. expected: {} actual: {}",
error(fmt::format("randomReadOp result size mismatch. expected: {0} actual: {1}",
expected.size(),
results->size()));
}
@ -105,7 +138,7 @@ private:
for (int i = 0; i < results->size(); i++) {
if ((*results)[i].key != expected[i].key) {
error(fmt::format("randomReadOp key mismatch at {}/{}. expected: {} actual: {}",
error(fmt::format("randomReadOp key mismatch at {0}/{1}. expected: {2} actual: {3}",
i,
results->size(),
fdb::toCharsRef(expected[i].key),
@ -138,6 +171,8 @@ private:
}
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
debugOp("GetGranules", begin, end, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType();
@ -149,15 +184,17 @@ private:
},
true);
},
[this, begin, end, results, cont]() {
this->validateRanges(results, begin, end, seenReadSuccess);
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetGranules", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont);
},
getTenant(tenantId));
}
void randomSummarizeOp(TTaskFct cont, std::optional<int> tenantId) {
if (!seenReadSuccess) {
if (!seenReadSuccess(tenantId)) {
// tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a
// read success
schedule(cont);
@ -169,6 +206,9 @@ private:
std::swap(begin, end);
}
auto results = std::make_shared<std::vector<fdb::GranuleSummary>>();
debugOp("Summarize", begin, end, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2 /*latest version*/, 1000).eraseType();
@ -180,10 +220,11 @@ private:
},
true);
},
[this, begin, end, results, cont]() {
ASSERT(results->size() > 0);
ASSERT(results->front().keyRange.beginKey <= begin);
ASSERT(results->back().keyRange.endKey >= end);
[this, begin, end, tenantId, results, cont]() {
debugOp("Summarize", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
// use validateRanges to share validation
auto ranges = std::make_shared<std::vector<fdb::KeyRange>>();
for (int i = 0; i < results->size(); i++) {
// TODO: could do validation of subsequent calls and ensure snapshot version never decreases
@ -191,12 +232,11 @@ private:
ASSERT((*results)[i].snapshotVersion <= (*results)[i].deltaVersion);
ASSERT((*results)[i].snapshotSize > 0);
ASSERT((*results)[i].deltaSize >= 0);
ranges->push_back((*results)[i].keyRange);
}
for (int i = 1; i < results->size(); i++) {
// ranges contain entire requested key range
ASSERT((*results)[i].keyRange.beginKey == (*results)[i - 1].keyRange.endKey);
}
this->validateRanges(ranges, begin, end, true);
schedule(cont);
},
@ -208,18 +248,29 @@ private:
fdb::Key end,
bool shouldBeRanges) {
if (shouldBeRanges) {
if (results->size() == 0) {
error(fmt::format(
"ValidateRanges: [{0} - {1}): No ranges returned!", fdb::toCharsRef(begin), fdb::toCharsRef(end)));
}
ASSERT(results->size() > 0);
if (results->front().beginKey > begin || results->back().endKey < end) {
error(fmt::format("ValidateRanges: [{0} - {1}): Incomplete range(s) returned [{2} - {3})!",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(results->front().beginKey),
fdb::toCharsRef(results->back().endKey)));
}
ASSERT(results->front().beginKey <= begin);
ASSERT(results->back().endKey >= end);
}
for (int i = 0; i < results->size(); i++) {
// no empty or inverted ranges
if ((*results)[i].beginKey >= (*results)[i].endKey) {
error(fmt::format("Empty/inverted range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey),
error(fmt::format("ValidateRanges: [{0} - {1}): Empty/inverted range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end)));
fdb::toCharsRef(end),
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey)));
}
ASSERT((*results)[i].beginKey < (*results)[i].endKey);
}
@ -227,16 +278,17 @@ private:
for (int i = 1; i < results->size(); i++) {
// ranges contain entire requested key range
if ((*results)[i].beginKey != (*results)[i].endKey) {
error(fmt::format("Non-contiguous range [{0} - {1}) for getBlobGranuleRanges({2} - {3})",
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey),
error(fmt::format("ValidateRanges: [{0} - {1}): Non-covereed range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end)));
fdb::toCharsRef(end),
fdb::toCharsRef((*results)[i - 1].endKey),
fdb::toCharsRef((*results)[i].endKey)));
}
ASSERT((*results)[i].beginKey == (*results)[i - 1].endKey);
}
}
// TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
@ -244,6 +296,10 @@ private:
if (begin > end) {
std::swap(begin, end);
}
std::optional<int> tenantId = {};
debugOp("GetBlobRanges", begin, end, tenantId, "starting");
execOperation(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType();
@ -252,22 +308,27 @@ private:
ctx->done();
});
},
[this, begin, end, results, cont]() {
this->validateRanges(results, begin, end, seenReadSuccess);
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetBlobRanges", begin, end, tenantId, fmt::format("complete with {0} ranges", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont);
},
/* failOnError = */ false);
}
// TODO: tenant support
void randomVerifyOp(TTaskFct cont) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
std::optional<int> tenantId;
if (begin > end) {
std::swap(begin, end);
}
auto verifyVersion = std::make_shared<int64_t>(false);
// info("Verify op starting");
debugOp("Verify", begin, end, tenantId, "starting");
execOperation(
[begin, end, verifyVersion](auto ctx) {
@ -277,16 +338,15 @@ private:
ctx->done();
});
},
[this, begin, end, verifyVersion, cont]() {
[this, begin, end, tenantId, verifyVersion, cont]() {
debugOp("Verify", begin, end, tenantId, fmt::format("Complete @ {0}", *verifyVersion));
bool previousSuccess = seenReadSuccess(tenantId);
if (*verifyVersion == -1) {
ASSERT(!seenReadSuccess);
} else {
if (!seenReadSuccess) {
info("BlobGranuleCorrectness::randomVerifyOp first success");
}
seenReadSuccess = true;
ASSERT(!previousSuccess);
} else if (!previousSuccess) {
info(fmt::format("Verify{0}: first success\n", tenantDebugString(tenantId)));
setReadSuccess(tenantId);
}
// info(fmt::format("verify op done @ {}", *verifyVersion));
schedule(cont);
},
/* failOnError = */ false);

View File

@ -57,6 +57,7 @@ public:
std::string tlsCertFile;
std::string tlsKeyFile;
std::string tlsCaFile;
bool retainClientLibCopies = false;
};
} // namespace FdbApiTester

View File

@ -27,7 +27,7 @@
#include <unordered_map>
#include <vector>
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
namespace FdbApiTester {

View File

@ -36,7 +36,7 @@ namespace FdbApiTester {
namespace {
#define API_VERSION_CLIENT_TMP_DIR 720
#define API_VERSION_CLIENT_TMP_DIR 730
enum TesterOptionId {
OPT_CONNFILE,
@ -61,6 +61,7 @@ enum TesterOptionId {
OPT_TLS_CERT_FILE,
OPT_TLS_KEY_FILE,
OPT_TLS_CA_FILE,
OPT_RETAIN_CLIENT_LIB_COPIES,
};
CSimpleOpt::SOption TesterOptionDefs[] = //
@ -89,6 +90,7 @@ CSimpleOpt::SOption TesterOptionDefs[] = //
{ OPT_TLS_CERT_FILE, "--tls-cert-file", SO_REQ_SEP },
{ OPT_TLS_KEY_FILE, "--tls-key-file", SO_REQ_SEP },
{ OPT_TLS_CA_FILE, "--tls-ca-file", SO_REQ_SEP },
{ OPT_RETAIN_CLIENT_LIB_COPIES, "--retain-client-lib-copies", SO_NONE },
SO_END_OF_OPTIONS };
void printProgramUsage(const char* execName) {
@ -140,6 +142,8 @@ void printProgramUsage(const char* execName) {
" Path to file containing client's TLS private key\n"
" --tls-ca-file FILE\n"
" Path to file containing TLS CA certificate\n"
" --retain-client-lib-copies\n"
" Retain temporary external client library copies\n"
" -h, --help Display this help and exit.\n",
FDB_API_VERSION);
}
@ -251,6 +255,9 @@ bool processArg(TesterOptions& options, const CSimpleOpt& args) {
case OPT_TLS_CA_FILE:
options.tlsCaFile.assign(args.OptionArg());
break;
case OPT_RETAIN_CLIENT_LIB_COPIES:
options.retainClientLibCopies = true;
break;
}
return true;
}
@ -348,6 +355,10 @@ void applyNetworkOptions(TesterOptions& options) {
if (!options.tlsCaFile.empty()) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_TLS_CA_PATH, options.tlsCaFile);
}
if (options.retainClientLibCopies) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_RETAIN_CLIENT_LIBRARY_COPIES);
}
}
void randomizeOptions(TesterOptions& options) {
@ -459,8 +470,10 @@ int main(int argc, char** argv) {
retCode = 1;
}
fprintf(stderr, "Stopping FDB network thread\n");
fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join();
fprintf(stderr, "FDB network thread successfully stopped\n");
} catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what());
retCode = 1;

View File

@ -38,6 +38,7 @@ from tmp_cluster import TempCluster
from local_cluster import TLSConfig
# fmt: on
TESTER_STATS_INTERVAL_SEC = 5
@ -98,6 +99,9 @@ def run_tester(args, cluster, test_file):
external_client_library = Path(args.external_client_library).resolve()
cmd += ["--external-client-library", external_client_library]
if args.retain_client_lib_copies:
cmd += ["--retain-client-lib-copies"]
if cluster.blob_granules_enabled:
cmd += [
"--blob-granule-local-file-path",
@ -209,6 +213,12 @@ def parse_args(argv):
parser.add_argument("--build-dir", "-b", type=str, required=True, help="FDB build directory")
parser.add_argument("--api-tester-bin", type=str, help="Path to the fdb_c_api_tester executable.", required=True)
parser.add_argument("--external-client-library", type=str, help="Path to the external client library.")
parser.add_argument(
"--retain-client-lib-copies",
action="store_true",
default=False,
help="Retain temporary external client library copies.",
)
parser.add_argument(
"--cluster-file",
type=str,

View File

@ -18,7 +18,7 @@
* limitations under the License.
*/
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include "unit/fdb_api.hpp"

View File

@ -23,7 +23,7 @@
#pragma once
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#endif
#include <cassert>
@ -716,6 +716,12 @@ public:
throwError("Failed to create transaction: ", err);
return Transaction(tx_native);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
if (!tenant)
throw std::runtime_error("blobbifyRange from null tenant");
return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
};
class Database {

View File

@ -7,8 +7,7 @@ import subprocess
import sys
import os
sys.path[:0] = [os.path.join(os.path.dirname(
__file__), '..', '..', '..', 'tests', 'TestRunner')]
sys.path[:0] = [os.path.join(os.path.dirname(__file__), '..', '..', '..', 'tests', 'TestRunner')]
# fmt: off
from binary_download import FdbBinaryDownloader, CURRENT_VERSION

View File

@ -131,21 +131,10 @@ repeat_immediate_steps:
iter.opName(),
iter.step,
err.what());
updateErrorStats(err, iter.op);
tx.onError(err).then([this, state = shared_from_this()](Future f) {
const auto rc = handleForOnError(tx, f, fmt::format("{}:{}", iter.opName(), iter.step));
if (rc == FutureRC::RETRY) {
stats.incrErrorCount(iter.op);
} else if (rc == FutureRC::CONFLICT) {
stats.incrConflictCount();
} else if (rc == FutureRC::ABORT) {
tx.reset();
signalEnd();
return;
}
// restart this iteration from beginning
iter = getOpBegin(args);
needs_commit = false;
postNextTick();
onIterationEnd(rc);
});
} else {
// async step succeeded
@ -159,20 +148,9 @@ repeat_immediate_steps:
}
} else {
// blob granules op error
auto rc = handleForOnError(tx, f, "BG_ON_ERROR");
if (rc == FutureRC::RETRY) {
stats.incrErrorCount(iter.op);
} else if (rc == FutureRC::CONFLICT) {
stats.incrConflictCount();
} else if (rc == FutureRC::ABORT) {
tx.reset();
stopcount.fetch_add(1);
return;
}
iter = getOpBegin(args);
needs_commit = false;
// restart this iteration from beginning
postNextTick();
updateErrorStats(f.error(), iter.op);
FutureRC rc = handleForOnError(tx, f, "BG_ON_ERROR");
onIterationEnd(rc);
}
});
}
@ -217,23 +195,10 @@ void ResumableStateForRunWorkload::onTransactionSuccess() {
"ERROR",
"Post-iteration commit returned error: {}",
err.what());
updateErrorStats(err, OP_COMMIT);
tx.onError(err).then([this, state = shared_from_this()](Future f) {
const auto rc = handleForOnError(tx, f, "ON_ERROR");
if (rc == FutureRC::CONFLICT)
stats.incrConflictCount();
else
stats.incrErrorCount(OP_COMMIT);
if (rc == FutureRC::ABORT) {
signalEnd();
return;
}
if (ended()) {
signalEnd();
} else {
iter = getOpBegin(args);
needs_commit = false;
postNextTick();
}
onIterationEnd(rc);
});
} else {
// commit successful
@ -249,13 +214,7 @@ void ResumableStateForRunWorkload::onTransactionSuccess() {
stats.incrOpCount(OP_TRANSACTION);
tx.reset();
watch_tx.startFromStop();
if (ended()) {
signalEnd();
} else {
// start next iteration
iter = getOpBegin(args);
postNextTick();
}
onIterationEnd(FutureRC::OK);
}
});
} else {
@ -268,12 +227,29 @@ void ResumableStateForRunWorkload::onTransactionSuccess() {
stats.incrOpCount(OP_TRANSACTION);
watch_tx.startFromStop();
tx.reset();
if (ended()) {
signalEnd();
onIterationEnd(FutureRC::OK);
}
}
void ResumableStateForRunWorkload::onIterationEnd(FutureRC rc) {
// restart current iteration from beginning unless ended
if (rc == FutureRC::OK || rc == FutureRC::ABORT) {
total_xacts++;
}
if (ended()) {
signalEnd();
} else {
iter = getOpBegin(args);
needs_commit = false;
postNextTick();
}
}
void ResumableStateForRunWorkload::updateErrorStats(fdb::Error err, int op) {
if (err) {
if (err.is(1020 /*not_committed*/)) {
stats.incrConflictCount();
} else {
iter = getOpBegin(args);
// start next iteration
postNextTick();
stats.incrErrorCount(op);
}
}
}

View File

@ -26,6 +26,7 @@
#include <boost/asio.hpp>
#include "logger.hpp"
#include "mako.hpp"
#include "future.hpp"
#include "shm.hpp"
#include "stats.hpp"
#include "time.hpp"
@ -79,6 +80,7 @@ struct ResumableStateForRunWorkload : std::enable_shared_from_this<ResumableStat
boost::asio::io_context& io_context;
Arguments const& args;
ThreadStatistics& stats;
int64_t total_xacts;
std::atomic<int>& stopcount;
std::atomic<int> const& signal;
int max_iters;
@ -102,20 +104,20 @@ struct ResumableStateForRunWorkload : std::enable_shared_from_this<ResumableStat
std::atomic<int> const& signal,
int max_iters,
OpIterator iter)
: logr(logr), db(db), tx(tx), io_context(io_context), args(args), stats(stats), stopcount(stopcount),
signal(signal), max_iters(max_iters), iter(iter), needs_commit(false) {
: logr(logr), db(db), tx(tx), io_context(io_context), args(args), stats(stats), total_xacts(0),
stopcount(stopcount), signal(signal), max_iters(max_iters), iter(iter), needs_commit(false) {
key1.resize(args.key_length);
key2.resize(args.key_length);
val.resize(args.value_length);
}
void signalEnd() noexcept { stopcount.fetch_add(1); }
bool ended() noexcept {
return (max_iters != -1 && max_iters >= stats.getOpCount(OP_TRANSACTION)) || signal.load() == SIGNAL_RED;
}
bool ended() noexcept { return (max_iters != -1 && total_xacts >= max_iters) || signal.load() == SIGNAL_RED; }
void postNextTick();
void runOneTick();
void updateStepStats();
void onTransactionSuccess();
void onIterationEnd(FutureRC rc);
void updateErrorStats(fdb::Error err, int op);
};
using RunWorkloadStateHandle = std::shared_ptr<ResumableStateForRunWorkload>;

View File

@ -31,21 +31,28 @@ extern thread_local mako::Logger logr;
namespace mako {
enum class FutureRC { OK, RETRY, CONFLICT, ABORT };
enum class FutureRC { OK, RETRY, ABORT };
template <class FutureType>
force_inline bool waitFuture(FutureType& f, std::string_view step) {
assert(f);
auto err = f.blockUntilReady();
if (err) {
assert(!err.retryable());
logr.error("'{}' found at blockUntilReady during step '{}'", err.what(), step);
return false;
} else {
return true;
}
}
template <class FutureType>
force_inline FutureRC handleForOnError(fdb::Transaction& tx, FutureType& f, std::string_view step) {
if (auto err = f.error()) {
if (err.is(1020 /*not_committed*/)) {
return FutureRC::CONFLICT;
} else if (err.retryable()) {
logr.warn("Retryable error '{}' found at on_error(), step: {}", err.what(), step);
return FutureRC::RETRY;
} else {
logr.error("Unretryable error '{}' found at on_error(), step: {}", err.what(), step);
tx.reset();
return FutureRC::ABORT;
}
assert(!(err.retryable()));
logr.error("Unretryable error '{}' found at on_error(), step: {}", err.what(), step);
tx.reset();
return FutureRC::ABORT;
} else {
return FutureRC::RETRY;
}
@ -54,8 +61,7 @@ force_inline FutureRC handleForOnError(fdb::Transaction& tx, FutureType& f, std:
template <class FutureType>
force_inline FutureRC waitAndHandleForOnError(fdb::Transaction& tx, FutureType& f, std::string_view step) {
assert(f);
if (auto err = f.blockUntilReady()) {
logr.error("'{}' found while waiting for on_error() future, step: {}", err.what(), step);
if (!waitFuture(f, step)) {
return FutureRC::ABORT;
}
return handleForOnError(tx, f, step);
@ -65,15 +71,13 @@ force_inline FutureRC waitAndHandleForOnError(fdb::Transaction& tx, FutureType&
template <class FutureType>
force_inline FutureRC waitAndHandleError(fdb::Transaction& tx, FutureType& f, std::string_view step) {
assert(f);
auto err = fdb::Error{};
if ((err = f.blockUntilReady())) {
const auto retry = err.retryable();
logr.error("{} error '{}' found during step: {}", (retry ? "Retryable" : "Unretryable"), err.what(), step);
return retry ? FutureRC::RETRY : FutureRC::ABORT;
if (!waitFuture(f, step)) {
return FutureRC::ABORT;
}
err = f.error();
if (!err)
auto err = f.error();
if (!err) {
return FutureRC::OK;
}
if (err.retryable()) {
logr.warn("step {} returned '{}'", step, err.what());
} else {

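For context, a hedged sketch of how these helpers are typically driven elsewhere in this change (e.g. mako's cleanup and populate paths): loop until the transaction either commits, is retried after on_error handling, or aborts. The names below come from this diff; the surrounding transaction setup is assumed:

// Illustrative retry loop built on waitAndHandleError (sketch, not part of this diff).
while (true) {
    auto future_commit = tx.commit();
    const auto rc = waitAndHandleError(tx, future_commit, "COMMIT_EXAMPLE");
    if (rc == FutureRC::OK) {
        break; // committed successfully
    } else if (rc == FutureRC::RETRY) {
        continue; // on_error handling already prepared the transaction for retry
    } else {
        return -1; // FutureRC::ABORT: unretryable error, give up
    }
}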
View File

@ -59,6 +59,8 @@
#include "shm.hpp"
#include "stats.hpp"
#include "time.hpp"
#include "rapidjson/document.h"
#include "rapidjson/error/en.h"
namespace mako {
@ -88,14 +90,29 @@ Transaction createNewTransaction(Database db, Arguments const& args, int id = -1
}
// Create Tenant Transaction
int tenant_id = (id == -1) ? urand(0, args.active_tenants - 1) : id;
Transaction tr;
std::string tenantStr;
// If provided tenants array, use it
if (tenants) {
return tenants[tenant_id].createTransaction();
tr = tenants[tenant_id].createTransaction();
} else {
tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
tr = t.createTransaction();
}
std::string tenantStr = "tenant" + std::to_string(tenant_id);
BytesRef tenant_name = toBytesRef(tenantStr);
Tenant t = db.openTenant(tenant_name);
return t.createTransaction();
if (!args.authorization_tokens.empty()) {
// lookup token based on tenant name and, if found, set authz token to transaction
if (tenantStr.empty())
tenantStr = "tenant" + std::to_string(tenant_id);
auto tokenMapItr = args.authorization_tokens.find(tenantStr);
if (tokenMapItr != args.authorization_tokens.end()) {
tr.setOption(FDB_TR_OPTION_AUTHORIZATION_TOKEN, tokenMapItr->second);
} else {
logr.warn("Authorization token map is not empty, but could not find token for tenant '{}'", tenantStr);
}
}
return tr;
}
uint64_t byteswapHelper(uint64_t input) {
@ -143,7 +160,7 @@ int cleanup(Database db, Arguments const& args) {
const auto rc = waitAndHandleError(tx, future_commit, "COMMIT_CLEANUP");
if (rc == FutureRC::OK) {
break;
} else if (rc == FutureRC::RETRY || rc == FutureRC::CONFLICT) {
} else if (rc == FutureRC::RETRY) {
// tx already reset
continue;
} else {
@ -266,24 +283,60 @@ int populate(Database db,
int batch_size = args.tenant_batch_size;
int batches = (args.total_tenants + batch_size - 1) / batch_size;
for (int batch = 0; batch < batches; ++batch) {
while (1) {
for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
std::string tenant_str = "tenant" + std::to_string(i);
Tenant::createTenant(systemTx, toBytesRef(tenant_str));
}
auto future_commit = systemTx.commit();
const auto rc = waitAndHandleError(systemTx, future_commit, "CREATE_TENANT");
if (rc == FutureRC::OK) {
// Keep going with reset transaction if commit was successful
systemTx.reset();
break;
} else if (rc == FutureRC::RETRY) {
// We want to retry this batch. Transaction is already reset
} else {
// Abort
return -1;
}
}
Tenant tenants[batch_size];
fdb::TypedFuture<fdb::future_var::Bool> blobbifyResults[batch_size];
// blobbify tenant ranges explicitly
// FIXME: skip if database not configured for blob granules?
for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
std::string tenant_name = "tenant" + std::to_string(i);
Tenant::createTenant(systemTx, toBytesRef(tenant_name));
std::string tenant_str = "tenant" + std::to_string(i);
BytesRef tenant_name = toBytesRef(tenant_str);
tenants[i] = db.openTenant(tenant_name);
std::string rangeEnd = "\xff";
blobbifyResults[i - (batch * batch_size)] =
tenants[i].blobbifyRange(BytesRef(), toBytesRef(rangeEnd));
}
auto future_commit = systemTx.commit();
const auto rc = waitAndHandleError(systemTx, future_commit, "CREATE_TENANT");
if (rc == FutureRC::OK) {
// Keep going with reset transaction if commit was successful
systemTx.reset();
} else if (rc == FutureRC::RETRY) {
// We want to retry this batch, so decrement the number
// and go back through the loop to get the same value
// Transaction is already reset
--batch;
} else {
// Abort
return -1;
for (int i = batch * batch_size; i < args.total_tenants && i < (batch + 1) * batch_size; ++i) {
while (true) {
// not technically an operation that's part of systemTx, but it works
const auto rc =
waitAndHandleError(systemTx, blobbifyResults[i - (batch * batch_size)], "BLOBBIFY_TENANT");
if (rc == FutureRC::OK) {
if (!blobbifyResults[i - (batch * batch_size)].get()) {
fmt::print("Blobbifying tenant {0} failed!\n", i);
return -1;
}
break;
} else if (rc == FutureRC::RETRY) {
continue;
} else {
// Abort
return -1;
}
}
}
systemTx.reset();
}
} else {
std::string last_tenant_name = "tenant" + std::to_string(args.total_tenants - 1);
@ -405,6 +458,16 @@ int populate(Database db,
return 0;
}
void updateErrorStatsRunMode(ThreadStatistics& stats, fdb::Error err, int op) {
if (err) {
if (err.is(1020 /*not_committed*/)) {
stats.incrConflictCount();
} else {
stats.incrErrorCount(op);
}
}
}
/* run one iteration of configured transaction */
int runOneTransaction(Transaction& tx,
Arguments const& args,
@ -435,17 +498,13 @@ transaction_begin:
} else {
future_rc = waitAndHandleForOnError(tx, f, opTable[op].name());
}
updateErrorStatsRunMode(stats, f.error(), op);
}
if (auto postStepFn = opTable[op].postStepFunction(step))
postStepFn(f, tx, args, key1, key2, val);
watch_step.stop();
if (future_rc != FutureRC::OK) {
if (future_rc == FutureRC::CONFLICT) {
stats.incrConflictCount();
} else if (future_rc == FutureRC::RETRY) {
stats.incrErrorCount(op);
} else {
// abort
if (future_rc == FutureRC::ABORT) {
return -1;
}
// retry from first op
@ -484,6 +543,7 @@ transaction_begin:
auto watch_commit = Stopwatch(StartAtCtor{});
auto f = tx.commit();
const auto rc = waitAndHandleError(tx, f, "COMMIT_AT_TX_END");
updateErrorStatsRunMode(stats, f.error(), OP_COMMIT);
watch_commit.stop();
auto tx_resetter = ExitGuard([&tx]() { tx.reset(); });
if (rc == FutureRC::OK) {
@ -493,10 +553,6 @@ transaction_begin:
}
stats.incrOpCount(OP_COMMIT);
} else {
if (rc == FutureRC::CONFLICT)
stats.incrConflictCount();
else
stats.incrErrorCount(OP_COMMIT);
if (rc == FutureRC::ABORT) {
return -1;
}
@ -563,62 +619,70 @@ int runWorkload(Database db,
/* main transaction loop */
while (1) {
Transaction tx = createNewTransaction(db, args, -1, args.active_tenants > 0 ? tenants : nullptr);
while ((thread_tps > 0) && (xacts >= current_tps)) {
if ((thread_tps > 0 /* iff throttling on */) && (xacts >= current_tps)) {
/* throttle on */
const auto time_now = steady_clock::now();
if (toDoubleSeconds(time_now - time_prev) >= 1.0) {
/* more than 1 second passed, no need to throttle */
xacts = 0;
time_prev = time_now;
/* update throttle rate */
current_tps = static_cast<int>(thread_tps * throttle_factor.load());
} else {
auto time_now = steady_clock::now();
while (toDoubleSeconds(time_now - time_prev) < 1.0) {
usleep(1000);
time_now = steady_clock::now();
}
/* more than 1 second passed*/
xacts = 0;
time_prev = time_now;
/* update throttle rate */
current_tps = static_cast<int>(thread_tps * throttle_factor.load());
}
/* enable transaction trace */
if (dotrace) {
const auto time_now = steady_clock::now();
if (toIntegerSeconds(time_now - time_last_trace) >= 1) {
time_last_trace = time_now;
traceid.clear();
fmt::format_to(std::back_inserter(traceid), "makotrace{:0>19d}", total_xacts);
logr.debug("txn tracing {}", traceid);
auto err = Error{};
err = tx.setOptionNothrow(FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER, toBytesRef(traceid));
if (current_tps > 0 || thread_tps == 0 /* throttling off */) {
Transaction tx = createNewTransaction(db, args, -1, args.active_tenants > 0 ? tenants : nullptr);
/* enable transaction trace */
if (dotrace) {
const auto time_now = steady_clock::now();
if (toIntegerSeconds(time_now - time_last_trace) >= 1) {
time_last_trace = time_now;
traceid.clear();
fmt::format_to(std::back_inserter(traceid), "makotrace{:0>19d}", total_xacts);
logr.debug("txn tracing {}", traceid);
auto err = Error{};
err = tx.setOptionNothrow(FDB_TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER, toBytesRef(traceid));
if (err) {
logr.error("TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER: {}", err.what());
}
err = tx.setOptionNothrow(FDB_TR_OPTION_LOG_TRANSACTION, BytesRef());
if (err) {
logr.error("TR_OPTION_LOG_TRANSACTION: {}", err.what());
}
}
}
/* enable transaction tagging */
if (dotagging > 0) {
tagstr.clear();
fmt::format_to(std::back_inserter(tagstr),
"{}{}{:0>3d}",
KEY_PREFIX,
args.txntagging_prefix,
urand(0, args.txntagging - 1));
auto err = tx.setOptionNothrow(FDB_TR_OPTION_AUTO_THROTTLE_TAG, toBytesRef(tagstr));
if (err) {
logr.error("TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER: {}", err.what());
}
err = tx.setOptionNothrow(FDB_TR_OPTION_LOG_TRANSACTION, BytesRef());
if (err) {
logr.error("TR_OPTION_LOG_TRANSACTION: {}", err.what());
}
}
}
/* enable transaction tagging */
if (dotagging > 0) {
tagstr.clear();
fmt::format_to(std::back_inserter(tagstr),
"{}{}{:0>3d}",
KEY_PREFIX,
args.txntagging_prefix,
urand(0, args.txntagging - 1));
auto err = tx.setOptionNothrow(FDB_TR_OPTION_AUTO_THROTTLE_TAG, toBytesRef(tagstr));
if (err) {
logr.error("TR_OPTION_DEBUG_TRANSACTION_IDENTIFIER: {}", err.what());
rc = runOneTransaction(tx, args, stats, key1, key2, val);
if (rc) {
logr.warn("runOneTransaction failed ({})", rc);
}
}
rc = runOneTransaction(tx, args, stats, key1, key2, val);
if (rc) {
logr.warn("runOneTransaction failed ({})", rc);
xacts++;
total_xacts++;
}
if (thread_iters != -1) {
if (thread_iters >= total_xacts) {
if (total_xacts >= thread_iters) {
/* xact limit reached */
break;
}
@ -626,8 +690,6 @@ int runWorkload(Database db,
/* signal turned red, target duration reached */
break;
}
xacts++;
total_xacts++;
}
return rc;
}
@ -710,6 +772,9 @@ void runAsyncWorkload(Arguments const& args,
args.iteration == 0
? -1
: computeThreadIters(args.iteration, worker_id, i, args.num_processes, args.async_xacts);
// argument validation should ensure max_iters > 0
assert(args.iteration == 0 || max_iters > 0);
auto state =
std::make_shared<ResumableStateForRunWorkload>(Logger(WorkerProcess{}, args.verbose, worker_id, i),
db,
@ -757,11 +822,15 @@ void workerThread(ThreadArgs& thread_args) {
const auto thread_tps =
args.tpsmax == 0 ? 0
: computeThreadTps(args.tpsmax, worker_id, thread_id, args.num_processes, args.num_threads);
// argument validation should ensure thread_tps > 0
assert(args.tpsmax == 0 || thread_tps > 0);
const auto thread_iters =
args.iteration == 0
? -1
: computeThreadIters(args.iteration, worker_id, thread_id, args.num_processes, args.num_threads);
// argument validation should ensure thread_iters > 0
assert(args.iteration == 0 || thread_iters > 0);
/* i'm ready */
readycount.fetch_add(1);
@ -815,6 +884,18 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
logr.error("network::setOption(FDB_NET_OPTION_DISTRIBUTED_CLIENT_TRACER): {}", err.what());
}
if (args.tls_certificate_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CERT_PATH, args.tls_certificate_file.value());
}
if (args.tls_key_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_KEY_PATH, args.tls_key_file.value());
}
if (args.tls_ca_file.has_value()) {
network::setOption(FDB_NET_OPTION_TLS_CA_PATH, args.tls_ca_file.value());
}
/* enable flatbuffers if specified */
if (args.flatbuffers) {
#ifdef FDB_NET_OPTION_USE_FLATBUFFERS
@ -982,57 +1063,56 @@ int workerProcessMain(Arguments const& args, int worker_id, shared_memory::Acces
}
/* initialize the parameters with default values */
int initArguments(Arguments& args) {
memset(&args, 0, sizeof(Arguments)); /* zero-out everything */
args.num_fdb_clusters = 0;
args.num_databases = 1;
args.api_version = maxApiVersion();
args.json = 0;
args.num_processes = 1;
args.num_threads = 1;
args.async_xacts = 0;
args.mode = MODE_INVALID;
args.rows = 100000;
args.load_factor = 1.0;
args.row_digits = digits(args.rows);
args.seconds = 30;
args.iteration = 0;
args.tpsmax = 0;
args.tpsmin = -1;
args.tpsinterval = 10;
args.tpschange = TPS_SIN;
args.sampling = 1000;
args.key_length = 32;
args.value_length = 16;
args.active_tenants = 0;
args.total_tenants = 0;
args.tenant_batch_size = 10000;
args.zipf = 0;
args.commit_get = 0;
args.verbose = 1;
args.flatbuffers = 0; /* internal */
args.knobs[0] = '\0';
args.log_group[0] = '\0';
args.prefixpadding = 0;
args.trace = 0;
args.tracepath[0] = '\0';
args.traceformat = 0; /* default to client's default (XML) */
args.streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
args.txntrace = 0;
args.txntagging = 0;
memset(args.txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
Arguments::Arguments() {
num_fdb_clusters = 0;
num_databases = 1;
api_version = maxApiVersion();
json = 0;
num_processes = 1;
num_threads = 1;
async_xacts = 0;
mode = MODE_INVALID;
rows = 100000;
load_factor = 1.0;
row_digits = digits(rows);
seconds = 0;
iteration = 0;
tpsmax = 0;
tpsmin = -1;
tpsinterval = 10;
tpschange = TPS_SIN;
sampling = 1000;
key_length = 32;
value_length = 16;
active_tenants = 0;
total_tenants = 0;
tenant_batch_size = 10000;
zipf = 0;
commit_get = 0;
verbose = 1;
flatbuffers = 0; /* internal */
knobs[0] = '\0';
log_group[0] = '\0';
prefixpadding = 0;
trace = 0;
tracepath[0] = '\0';
traceformat = 0; /* default to client's default (XML) */
streaming_mode = FDB_STREAMING_MODE_WANT_ALL;
txntrace = 0;
txntagging = 0;
memset(txntagging_prefix, 0, TAGPREFIXLENGTH_MAX);
for (auto i = 0; i < MAX_OP; i++) {
args.txnspec.ops[i][OP_COUNT] = 0;
txnspec.ops[i][OP_COUNT] = 0;
}
args.client_threads_per_version = 0;
args.disable_client_bypass = false;
args.disable_ryw = 0;
args.json_output_path[0] = '\0';
args.stats_export_path[0] = '\0';
args.bg_materialize_files = false;
args.bg_file_path[0] = '\0';
args.distributed_tracer_client = 0;
return 0;
client_threads_per_version = 0;
disable_client_bypass = false;
disable_ryw = 0;
json_output_path[0] = '\0';
stats_export_path[0] = '\0';
bg_materialize_files = false;
bg_file_path[0] = '\0';
distributed_tracer_client = 0;
num_report_files = 0;
}
/* parse transaction specification */
@ -1234,7 +1314,7 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
/* name, has_arg, flag, val */
{ "api_version", required_argument, NULL, 'a' },
{ "cluster", required_argument, NULL, 'c' },
{ "num_databases", optional_argument, NULL, 'd' },
{ "num_databases", required_argument, NULL, 'd' },
{ "procs", required_argument, NULL, 'p' },
{ "threads", required_argument, NULL, 't' },
{ "async_xacts", required_argument, NULL, ARG_ASYNC },
@ -1279,8 +1359,23 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
{ "bg_file_path", required_argument, NULL, ARG_BG_FILE_PATH },
{ "stats_export_path", optional_argument, NULL, ARG_EXPORT_PATH },
{ "distributed_tracer_client", required_argument, NULL, ARG_DISTRIBUTED_TRACER_CLIENT },
{ "tls_certificate_file", required_argument, NULL, ARG_TLS_CERTIFICATE_FILE },
{ "tls_key_file", required_argument, NULL, ARG_TLS_KEY_FILE },
{ "tls_ca_file", required_argument, NULL, ARG_TLS_CA_FILE },
{ "authorization_token_file", required_argument, NULL, ARG_AUTHORIZATION_TOKEN_FILE },
{ NULL, 0, NULL, 0 }
};
/* For optional arguments, optarg is only set when the argument is passed as "--option=[ARGUMENT]" but not as
"--option [ARGUMENT]". This function sets optarg in the latter case. See
https://cfengine.com/blog/2021/optional-arguments-with-getopt-long/ for a more detailed explanation */
#define SET_OPT_ARG_IF_PRESENT() \
{ \
if (optarg == NULL && optind < argc && argv[optind][0] != '-') { \
optarg = argv[optind++]; \
} \
}
idx = 0;
c = getopt_long(argc, argv, short_options, long_options, &idx);
if (c < 0) {
@ -1482,9 +1577,8 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.disable_ryw = 1;
break;
case ARG_JSON_REPORT:
if (optarg == NULL && (argv[optind] == NULL || (argv[optind] != NULL && argv[optind][0] == '-'))) {
// if --report_json is the last option and no file is specified
// or --report_json is followed by another option
SET_OPT_ARG_IF_PRESENT();
if (!optarg) {
char default_file[] = "mako.json";
strncpy(args.json_output_path, default_file, sizeof(default_file));
} else {
@ -1495,13 +1589,12 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.bg_materialize_files = true;
strncpy(args.bg_file_path, optarg, std::min(sizeof(args.bg_file_path), strlen(optarg) + 1));
case ARG_EXPORT_PATH:
if (optarg == NULL && (argv[optind] == NULL || (argv[optind] != NULL && argv[optind][0] == '-'))) {
SET_OPT_ARG_IF_PRESENT();
if (!optarg) {
char default_file[] = "sketch_data.json";
strncpy(args.stats_export_path, default_file, sizeof(default_file));
} else {
strncpy(args.stats_export_path,
argv[optind],
std::min(sizeof(args.stats_export_path), strlen(argv[optind]) + 1));
strncpy(args.stats_export_path, optarg, std::min(sizeof(args.stats_export_path), strlen(optarg) + 1));
}
break;
case ARG_DISTRIBUTED_TRACER_CLIENT:
@ -1515,6 +1608,45 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
args.distributed_tracer_client = -1;
}
break;
case ARG_TLS_CERTIFICATE_FILE:
args.tls_certificate_file = std::string(optarg);
break;
case ARG_TLS_KEY_FILE:
args.tls_key_file = std::string(optarg);
break;
case ARG_TLS_CA_FILE:
args.tls_ca_file = std::string(optarg);
break;
case ARG_AUTHORIZATION_TOKEN_FILE: {
std::string tokenFilename(optarg);
std::ifstream ifs(tokenFilename);
std::ostringstream oss;
oss << ifs.rdbuf();
rapidjson::Document d;
d.Parse(oss.str().c_str());
if (d.HasParseError()) {
logr.error("Failed to parse authorization token JSON file '{}': {} at offset {}",
tokenFilename,
GetParseError_En(d.GetParseError()),
d.GetErrorOffset());
return -1;
} else if (!d.IsObject()) {
logr.error("Authorization token JSON file '{}' must contain a JSON object", tokenFilename);
return -1;
}
for (auto itr = d.MemberBegin(); itr != d.MemberEnd(); ++itr) {
if (!itr->value.IsString()) {
logr.error("Token '{}' is not a string", itr->name.GetString());
return -1;
}
args.authorization_tokens.insert_or_assign(
std::string(itr->name.GetString(), itr->name.GetStringLength()),
std::string(itr->value.GetString(), itr->value.GetStringLength()));
}
logr.info("Added {} tenant authorization tokens to map from file '{}'",
args.authorization_tokens.size(),
tokenFilename);
} break;
}
}
@ -1525,93 +1657,118 @@ int parseArguments(int argc, char* argv[], Arguments& args) {
return 0;
}
int validateArguments(Arguments const& args) {
if (args.mode == MODE_INVALID) {
int Arguments::validate() {
if (mode == MODE_INVALID) {
logr.error("--mode has to be set");
return -1;
}
if (args.verbose < VERBOSE_NONE || args.verbose > VERBOSE_DEBUG) {
if (verbose < VERBOSE_NONE || verbose > VERBOSE_DEBUG) {
logr.error("--verbose must be between 0 and 3");
return -1;
}
if (args.rows <= 0) {
if (rows <= 0) {
logr.error("--rows must be a positive integer");
return -1;
}
if (args.load_factor <= 0 || args.load_factor > 1) {
if (load_factor <= 0 || load_factor > 1) {
logr.error("--load_factor must be in range (0, 1]");
return -1;
}
if (args.key_length < 0) {
if (key_length < 0) {
logr.error("--keylen must be a positive integer");
return -1;
}
if (args.value_length < 0) {
if (value_length < 0) {
logr.error("--vallen must be a positive integer");
return -1;
}
if (args.num_fdb_clusters > NUM_CLUSTERS_MAX) {
if (num_fdb_clusters > NUM_CLUSTERS_MAX) {
logr.error("Mako is not supported to do work to more than {} clusters", NUM_CLUSTERS_MAX);
return -1;
}
if (args.num_databases > NUM_DATABASES_MAX) {
if (num_databases > NUM_DATABASES_MAX) {
logr.error("Mako is not supported to do work to more than {} databases", NUM_DATABASES_MAX);
return -1;
}
if (args.num_databases < args.num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", args.num_databases, args.num_fdb_clusters);
if (num_databases < num_fdb_clusters) {
logr.error("--num_databases ({}) must be >= number of clusters({})", num_databases, num_fdb_clusters);
return -1;
}
if (args.num_threads < args.num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", args.num_threads, args.num_databases);
if (num_threads < num_databases) {
logr.error("--threads ({}) must be >= number of databases ({})", num_threads, num_databases);
return -1;
}
if (args.key_length < 4 /* "mako" */ + args.row_digits) {
if (key_length < 4 /* "mako" */ + row_digits) {
logr.error("--keylen must be larger than {} to store \"mako\" prefix "
"and maximum row number",
4 + args.row_digits);
4 + row_digits);
return -1;
}
if (args.active_tenants > args.total_tenants) {
if (active_tenants > total_tenants) {
logr.error("--active_tenants must be less than or equal to --total_tenants");
return -1;
}
if (args.tenant_batch_size < 1) {
if (tenant_batch_size < 1) {
logr.error("--tenant_batch_size must be at least 1");
return -1;
}
if (args.mode == MODE_RUN) {
if ((args.seconds > 0) && (args.iteration > 0)) {
if (mode == MODE_RUN) {
if ((seconds > 0) && (iteration > 0)) {
logr.error("Cannot specify seconds and iteration together");
return -1;
}
if ((args.seconds == 0) && (args.iteration == 0)) {
if ((seconds == 0) && (iteration == 0)) {
logr.error("Must specify either seconds or iteration");
return -1;
}
if (args.txntagging < 0) {
if (txntagging < 0) {
logr.error("--txntagging must be a non-negative integer");
return -1;
}
}
// ensure that all of the files provided to mako are valid and exist
if (args.mode == MODE_REPORT) {
if (!args.num_report_files) {
logr.error("No files to merge");
}
for (int i = 0; i < args.num_report_files; i++) {
struct stat buffer;
if (stat(args.report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", args.report_files[i]);
if (iteration > 0) {
if (async_xacts > 0 && async_xacts * num_processes > iteration) {
logr.error("--async_xacts * --num_processes must be <= --iteration");
return -1;
} else if (async_xacts == 0 && num_threads * num_processes > iteration) {
logr.error("--num_threads * --num_processes must be <= --iteration");
return -1;
}
}
}
if (args.distributed_tracer_client < 0) {
logr.error("--disibuted_tracer_client must specify either (disabled, network_lossy, log_file)");
if (mode == MODE_RUN || mode == MODE_BUILD) {
if (tpsmax > 0) {
if (async_xacts > 0) {
logr.error("--tpsmax|--tps must be 0 or unspecified because throttling is not supported in async mode");
return -1;
} else if (async_xacts == 0 && num_threads * num_processes > tpsmax) {
logr.error("--num_threads * --num_processes must be <= --tpsmax|--tps");
return -1;
}
}
}
// ensure that all of the files provided to mako are valid and exist
if (mode == MODE_REPORT) {
if (!num_report_files) {
logr.error("No files to merge");
}
for (int i = 0; i < num_report_files; i++) {
struct stat buffer;
if (stat(report_files[i], &buffer) != 0) {
logr.error("Couldn't open file {}", report_files[i]);
return -1;
}
}
}
if (distributed_tracer_client < 0) {
logr.error("--distributed_tracer_client must specify either (disabled, network_lossy, log_file)");
return -1;
}
if (!authorization_tokens.empty() && !tls_ca_file.has_value()) {
logr.warn("Authorization tokens are being used without explicit TLS CA file configured");
}
return 0;
}
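As a concrete reading of the new run-mode checks above (an illustrative example, not from the diff): a hypothetical invocation with --procs 2 --threads 4 requires --iteration to be at least 2 * 4 = 8 when an iteration count is given, and --tpsmax (or --tps) to be at least 8 when throttling is enabled; in async mode (--async_xacts > 0), a non-zero --tpsmax is rejected outright because throttling is not supported there.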
@ -2262,11 +2419,6 @@ int main(int argc, char* argv[]) {
auto rc = int{};
auto args = Arguments{};
rc = initArguments(args);
if (rc < 0) {
logr.error("initArguments failed");
return -1;
}
rc = parseArguments(argc, argv, args);
if (rc < 0) {
/* usage printed */
@ -2282,7 +2434,12 @@ int main(int argc, char* argv[]) {
args.total_tenants = args.active_tenants;
}
rc = validateArguments(args);
// set --seconds in case no ending condition has been set
if (args.seconds == 0 && args.iteration == 0) {
args.seconds = 30; // default value according to documentation
}
rc = args.validate();
if (rc < 0)
return -1;
logr.setVerbosity(args.verbose);

View File

@ -22,7 +22,7 @@
#define MAKO_HPP
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#endif
#include <array>
@ -30,6 +30,7 @@
#include <cassert>
#include <chrono>
#include <list>
#include <map>
#include <vector>
#include <string_view>
#include <fdb_api.hpp>
@ -79,7 +80,11 @@ enum ArgKind {
ARG_JSON_REPORT,
ARG_BG_FILE_PATH, // if blob granule files are stored locally, mako will read and materialize them if this is set
ARG_EXPORT_PATH,
ARG_DISTRIBUTED_TRACER_CLIENT
ARG_DISTRIBUTED_TRACER_CLIENT,
ARG_TLS_CERTIFICATE_FILE,
ARG_TLS_KEY_FILE,
ARG_TLS_CA_FILE,
ARG_AUTHORIZATION_TOKEN_FILE,
};
constexpr const int OP_COUNT = 0;
@ -131,6 +136,9 @@ constexpr const int MAX_REPORT_FILES = 200;
/* benchmark parameters */
struct Arguments {
Arguments();
int validate();
int api_version;
int json;
int num_processes;
@ -180,6 +188,10 @@ struct Arguments {
char report_files[MAX_REPORT_FILES][PATH_MAX];
int num_report_files;
int distributed_tracer_client;
std::optional<std::string> tls_certificate_file;
std::optional<std::string> tls_key_file;
std::optional<std::string> tls_ca_file;
std::map<std::string, std::string> authorization_tokens; // maps tenant name to token string
};
} // namespace mako

View File

@ -38,7 +38,7 @@ Arguments
| - ``build``: Populate data
| - ``run``: Run the benchmark
- | ``-c | --cluster <cluster file>``
- | ``-c | --cluster <cluster_file>``
| FDB cluster files (Required, comma-separated)
- | ``-d | --num_databases <num_databases>``
@ -125,9 +125,21 @@ Arguments
| Disable snapshot read-your-writes
- | ``--json_report`` defaults to ``mako.json``
| ``--json_report=PATH``
| ``--json_report <path>``
| Output stats to the specified json file
- | ``--tls_certificate_file <path>``
| Use TLS certificate located in ``<path>``
- | ``--tls_key_file <path>``
| Use TLS key file located in ``<path>``
- | ``--tls_ca_file <path>``
| Use TLS CA file located in ``<path>``
- | ``--authorization_token_file <path>``
| Use authorization token JSON file located in ``<path>``
| Expected content is a JSON object where each key is a tenant name and the mapped value is a token string
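
A minimal sketch, not mako's actual implementation, of how such a file could be loaded into the tenant-to-token map declared in ``mako.hpp`` (the ``loadAuthorizationTokens`` helper name is hypothetical; RapidJSON is assumed as the JSON parser)::

    #include <fstream>
    #include <map>
    #include <sstream>
    #include <string>
    #include "rapidjson/document.h"

    // Hypothetical helper: parse {"tenant": "token", ...} into a tenant-name -> token map.
    std::map<std::string, std::string> loadAuthorizationTokens(const std::string& path) {
        std::ifstream in(path);
        std::stringstream ss;
        ss << in.rdbuf();
        rapidjson::Document doc;
        doc.Parse(ss.str().c_str());
        std::map<std::string, std::string> tokens;
        if (doc.IsObject()) {
            for (auto itr = doc.MemberBegin(); itr != doc.MemberEnd(); ++itr) {
                if (itr->value.IsString())
                    tokens[itr->name.GetString()] = itr->value.GetString();
            }
        }
        return tokens;
    }
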
Transaction Specification
=========================

View File

@ -29,7 +29,7 @@
#include <inttypes.h>
#ifndef FDB_API_VERSION
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#endif
#include <foundationdb/fdb_c.h>

View File

@ -20,7 +20,7 @@
// Unit tests that test the timeouts for a disconnected cluster
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <chrono>

View File

@ -231,6 +231,10 @@ Int64Future Transaction::get_approximate_size() {
return Int64Future(fdb_transaction_get_approximate_size(tr_));
}
Int64Future Transaction::get_total_cost() {
return Int64Future(fdb_transaction_get_total_cost(tr_));
}
KeyFuture Transaction::get_versionstamp() {
return KeyFuture(fdb_transaction_get_versionstamp(tr_));
}

View File

@ -39,7 +39,7 @@
#pragma once
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <string>
@ -276,6 +276,9 @@ public:
// Returns a future which will be set to the approximate transaction size so far.
Int64Future get_approximate_size();
// Returns a future which will be set to the transaction's total cost so far.
Int64Future get_total_cost();
// Returns a future which will be set to the versionstamp which was used by
// any versionstamp operations in the transaction.
KeyFuture get_versionstamp();

View File

@ -20,7 +20,7 @@
// Unit tests for API setup, network initialization functions from the FDB C API.
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <iostream>
#include <thread>

View File

@ -21,7 +21,7 @@
// Unit tests for the FoundationDB C API.
#include "fdb_c_options.g.h"
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include <foundationdb/fdb_c.h>
#include <assert.h>
#include <string.h>
@ -1945,6 +1945,30 @@ TEST_CASE("fdb_transaction_get_committed_version") {
}
}
TEST_CASE("fdb_transaction_get_total_cost") {
fdb::Transaction tr(db);
while (1) {
fdb::ValueFuture f1 = tr.get("foo", /*snapshot*/ false);
fdb_error_t err = wait_future(f1);
if (err) {
fdb::EmptyFuture fOnError = tr.on_error(err);
fdb_check(wait_future(fOnError));
continue;
}
fdb::Int64Future f2 = tr.get_total_cost();
err = wait_future(f2);
if (err) {
fdb::EmptyFuture fOnError = tr.on_error(err);
fdb_check(wait_future(fOnError));
continue;
}
int64_t cost;
fdb_check(f2.get(&cost));
CHECK(cost > 0);
break;
}
}
TEST_CASE("fdb_transaction_get_approximate_size") {
fdb::Transaction tr(db);
while (1) {

View File

@ -18,7 +18,7 @@
* limitations under the License.
*/
#define FDB_API_VERSION 720
#define FDB_API_VERSION 730
#include "foundationdb/fdb_c.h"
#undef DLLEXPORT
#include "workloads.h"

View File

@ -251,6 +251,11 @@ func (o NetworkOptions) SetFutureVersionClientLibrary(param string) error {
return o.setOpt(66, []byte(param))
}
// Retain temporary external client library copies that are created for enabling multi-threading.
func (o NetworkOptions) SetRetainClientLibraryCopies() error {
return o.setOpt(67, nil)
}
// Disables logging of client statistics, such as sampled transaction activity.
func (o NetworkOptions) SetDisableClientStatisticsLogging() error {
return o.setOpt(70, nil)

View File

@ -137,7 +137,7 @@ endif()
if(NOT BUILD_GO_BINDING OR NOT BUILD_C_BINDING)
set(WITH_GO_BINDING OFF)
else()
find_program(GO_EXECUTABLE go)
find_program(GO_EXECUTABLE go HINTS /usr/local/go/bin/)
# building the go binaries is currently not supported on Windows
if(GO_EXECUTABLE AND NOT WIN32 AND WITH_C_BINDING)
set(WITH_GO_BINDING ON)

View File

@ -76,38 +76,11 @@ function(generate_coverage_xml)
add_dependencies(coverage_${target_name} coveragetool)
endfunction()
# This function asserts that `versions.h` does not exist in the source
# directory. It does this in the prebuild phase of the target.
# This is an ugly hack that should make sure that cmake isn't used with
# a source directory in which FDB was previously built with `make`.
function(assert_no_version_h target)
message(STATUS "Check versions.h on ${target}")
set(target_name "${target}_versions_h_check")
if (DEFINED ENV{VERBOSE})
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMAND echo
"${CMAKE_COMMAND}" -P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
-DFILE="${CMAKE_SOURCE_DIR}/versions.h"
COMMENT "Check old build system wasn't used in source dir")
else()
add_custom_target("${target_name}"
COMMAND "${CMAKE_COMMAND}" -DFILE="${CMAKE_SOURCE_DIR}/versions.h"
-P "${CMAKE_SOURCE_DIR}/cmake/AssertFileDoesntExist.cmake"
COMMENT "Check old build system wasn't used in source dir")
endif()
add_dependencies(${target} ${target_name})
endfunction()
add_custom_target(strip_targets)
add_dependencies(packages strip_targets)
function(strip_debug_symbols target)
if (WIN32)
if(WIN32)
return()
endif()
get_target_property(target_type ${target} TYPE)
@ -146,7 +119,7 @@ function(strip_debug_symbols target)
COMMAND objcopy --verbose --only-keep-debug $<TARGET_FILE:${target}> "${out_file}.debug"
COMMAND objcopy --verbose --add-gnu-debuglink="${out_file}.debug" "${out_file}"
COMMENT "Copy debug symbols to ${out_name}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
add_custom_target(strip_${target} DEPENDS "${out_file}.debug")
else()
add_custom_target(strip_${target})
add_dependencies(strip_${target} strip_only_${target})
@ -171,7 +144,7 @@ function(copy_headers)
foreach(f IN LISTS CP_SRCS)
is_prefix(bd "${CMAKE_CURRENT_BINARY_DIR}" "${f}")
is_prefix(sd "${CMAKE_CURRENT_SOURCE_DIR}" "${f}")
if (bd OR sd)
if(bd OR sd)
continue()
endif()
is_header(hdr "${f}")
@ -180,7 +153,7 @@ function(copy_headers)
endif()
get_filename_component(fname ${f} NAME)
get_filename_component(dname ${f} DIRECTORY)
if (dname)
if(dname)
make_directory(${incl_dir}/${dname})
endif()
set(fpath "${incl_dir}/${dname}/${fname}")
@ -309,9 +282,6 @@ function(add_flow_target)
add_custom_target(${AFT_NAME}_actors DEPENDS ${generated_files})
add_dependencies(${AFT_NAME} ${AFT_NAME}_actors)
if(NOT WIN32)
assert_no_version_h(${AFT_NAME}_actors)
endif()
generate_coverage_xml(${AFT_NAME})
if(strip_target)
strip_debug_symbols(${AFT_NAME})

View File

@ -8,40 +8,43 @@ endif()
include(ExternalProject)
ExternalProject_Add(awssdk_project
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
GIT_REPOSITORY https://github.com/aws/aws-sdk-cpp.git
GIT_TAG e4b4b310d8631bc7e9a797b6ac03a73c6f210bf6 # v1.9.331
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build"
GIT_CONFIG advice.detachedHead=false
# it seems advice.detachedHead breaks something which causes aws sdk to always be rebuilt.
# This option forces cmake to build the aws sdk only once and never attempt to update it
UPDATE_DISCONNECTED ON
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF # SDK builds shared libs by default, we want static libs
-DENABLE_TESTING=OFF
-DBUILD_ONLY=core # git repo contains SDK for every AWS product, we only want the core auth libraries
-DSIMPLE_INSTALL=ON
-DCMAKE_INSTALL_PREFIX=install # need to specify an install prefix so it doesn't install in /usr/lib - FIXME: use absolute path
-DBYO_CRYPTO=ON # we have our own crypto libraries that conflict if we let aws sdk build and link its own
-DBUILD_CURL=ON
-DBUILD_ZLIB=ON
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_FLAGS=${AWSSDK_COMPILER_FLAGS}
TEST_COMMAND ""
# the sdk build produces a ton of artifacts, with their own dependency tree, so there is a very specific dependency order they must be linked in
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-cpp-sdk-core.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-crt-cpp.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-s3.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-auth.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-event-stream.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-http.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-mqtt.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-sdkutils.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-io.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-checksums.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-compression.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-cal.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/lib64/libaws-c-common.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/curl/lib/libcurl.a"
"${CMAKE_CURRENT_BINARY_DIR}/awssdk-build/install/external-install/zlib/lib/libz.a"
)
add_library(awssdk_core STATIC IMPORTED)
add_dependencies(awssdk_core awssdk_project)

View File

@ -303,7 +303,6 @@ class TestRun:
self.stats: str | None = stats
self.expected_unseed: int | None = expected_unseed
self.use_valgrind: bool = config.use_valgrind
self.long_running: bool = config.long_running
self.old_binary_path: Path = config.old_binaries_path
self.buggify_enabled: bool = buggify_enabled
self.fault_injection_enabled: bool = True
@ -315,7 +314,7 @@ class TestRun:
# state for the run
self.retryable_error: bool = False
self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed,
will_restart=will_restart)
will_restart=will_restart, long_running=config.long_running)
self.run_time: int = 0
self.success = self.run()
@ -367,6 +366,11 @@ class TestRun:
command += ['-b', 'on']
if config.crash_on_error:
command.append('--crash')
if config.long_running:
# disable simulation speedup
command += ['--knob-sim-speedup-after-seconds=36000']
# avoid the TraceTooManyLines error by raising the MAX_TRACE_LINES knob
command += ['--knob-max-trace-lines=1000000000']
self.temp_path.mkdir(parents=True, exist_ok=True)
@ -376,7 +380,8 @@ class TestRun:
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
did_kill = False
timeout = 20 * config.kill_seconds if self.use_valgrind or self.long_running else config.kill_seconds
# No timeout for long running tests
timeout = 20 * config.kill_seconds if self.use_valgrind else (None if config.long_running else config.kill_seconds)
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)

View File

@ -291,11 +291,12 @@ class Summary:
def __init__(self, binary: Path, runtime: float = 0, max_rss: int | None = None,
was_killed: bool = False, uid: uuid.UUID | None = None, expected_unseed: int | None = None,
exit_code: int = 0, valgrind_out_file: Path | None = None, stats: str | None = None,
error_out: str = None, will_restart: bool = False):
error_out: str = None, will_restart: bool = False, long_running: bool = False):
self.binary = binary
self.runtime: float = runtime
self.max_rss: int | None = max_rss
self.was_killed: bool = was_killed
self.long_running = long_running
self.expected_unseed: int | None = expected_unseed
self.exit_code: int = exit_code
self.out: SummaryTree = SummaryTree('Test')
@ -396,6 +397,10 @@ class Summary:
if self.was_killed:
child = SummaryTree('ExternalTimeout')
child.attributes['Severity'] = '40'
if self.long_running:
# debugging info for long-running tests
child.attributes['LongRunning'] = '1'
child.attributes['Runtime'] = str(self.runtime)
self.out.append(child)
self.error = True
if self.max_rss is not None:

View File

@ -55,6 +55,6 @@ if __name__ == '__main__':
summary.summarize_files(files)
summary.out.dump(sys.stdout)
else:
summary = Summary(Path('bin/fdbserver'), was_killed=True)
summary = Summary(Path('bin/fdbserver'), was_killed=True, long_running=config.long_running)
summary.summarize_files(files)
summary.out.dump(sys.stdout)

View File

@ -11,16 +11,16 @@ The global tag throttler bases throttling decisions on "quotas" provided by clie
The global tag throttler cannot throttle tags to a throughput below the reserved quota, and it cannot allow throughput to exceed the total quota.
### Cost
Internally, the units for these quotas are "page costs", computed as follows. The "page cost" of a read operation is computed as:
Internally, the units for these quotas are bytes. The cost of an operation is rounded up to the nearest page size. The cost of a read operation is computed as:
```
readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
readCost = ceiling(bytesRead / CLIENT_KNOBS->READ_COST_BYTE_FACTOR) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
```
The "page cost" of a write operation is computed as:
The cost of a write operation is computed as:
```
writeCost = SERVER_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR);
writeCost = CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * ceiling(bytesWritten / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR) * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
```
Here `bytesWritten` includes cleared bytes. The size of range clears is estimated at commit time.
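
A minimal sketch of these page-aligned cost computations; the knob values below are illustrative assumptions, with the real values coming from `CLIENT_KNOBS`:

```
#include <cstdint>

// Illustrative knob values; the real ones are CLIENT_KNOBS->READ_COST_BYTE_FACTOR,
// CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR, and the read/write fungibility ratio knob.
constexpr int64_t READ_COST_BYTE_FACTOR = 16384;
constexpr int64_t WRITE_COST_BYTE_FACTOR = 16384;
constexpr double RW_FUNGIBILITY_RATIO = 5.0;

// Cost of a read: bytes read, rounded up to the nearest read page.
int64_t readCost(int64_t bytesRead) {
    int64_t pages = (bytesRead + READ_COST_BYTE_FACTOR - 1) / READ_COST_BYTE_FACTOR; // ceiling division
    return pages * READ_COST_BYTE_FACTOR;
}

// Cost of a write: bytes written (including cleared bytes), rounded up to the
// nearest write page and scaled by the read/write fungibility ratio.
double writeCost(int64_t bytesWritten) {
    int64_t pages = (bytesWritten + WRITE_COST_BYTE_FACTOR - 1) / WRITE_COST_BYTE_FACTOR;
    return RW_FUNGIBILITY_RATIO * pages * WRITE_COST_BYTE_FACTOR;
}
```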
@ -41,12 +41,6 @@ To set the quota through `fdbcli`, run:
fdbcli> quota set <tag> [reserved_throughput|total_throughput] <bytes_per_second>
```
Note that the quotas are specified in terms of bytes/second, and internally converted to page costs:
```
page_cost_quota = ceiling(byte_quota / CLIENT_KNOBS->READ_COST_BYTE_FACTOR)
```
To clear both the reserved and total throughput quotas for a tag, run:
```

View File

@ -34,20 +34,25 @@ Commit proxies would combine idempotency IDs for transactions within a batch. Th
## Value format
```
${protocol_version}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
${protocol_version}${timestamp}(${n (1 byte)}${idempotency_id (n bytes)}${low_order_byte_of_batch_index})*
```
The batch index for each idempotency id can be reconstructed from the high-order byte and low-order byte stored in the key and value, respectively. This is necessary for an "unknown_committed" transaction to recover its full version stamp. The batch index is a `short int`, i.e. 2 bytes.
The timestamp is the unix epoch stored as a little-endian signed 64-bit integer.
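
A minimal sketch of decoding one such value, assuming the protocol-version prefix has already been consumed and the input is well formed (the real code uses the project's `BinaryReader`; this standalone version is for illustration only):

```
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

struct IdempotencyEntry {
    std::string id;             // the idempotency id (n bytes)
    uint8_t lowOrderBatchIndex; // low-order byte of the batch index
};

// Decode: little-endian int64 timestamp, then repeated
// (n, idempotency_id, low_order_byte_of_batch_index) entries.
std::vector<IdempotencyEntry> decodeIdempotencyValue(const uint8_t* data, size_t len, int64_t& timestamp) {
    size_t pos = 0;
    std::memcpy(&timestamp, data + pos, sizeof(timestamp)); // assumes a little-endian host
    pos += sizeof(timestamp);
    std::vector<IdempotencyEntry> entries;
    while (pos < len) {
        uint8_t n = data[pos++];
        IdempotencyEntry e;
        e.id.assign(reinterpret_cast<const char*>(data + pos), n);
        pos += n;
        e.lowOrderBatchIndex = data[pos++];
        entries.push_back(std::move(e));
    }
    return entries;
}
```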
# Cleaning up old idempotency ids
After learning the result of an attempt to commit a transaction with an
idempotency id, the client may inform the cluster that it's no longer interested
in that id and the cluster can reclaim the space used to store the idempotency
id. The happy-path reply to a CommitTransactionRequest will say which proxy this
request should be sent to, and all idempotency ids for a database key will be
sent to the same proxy so that it can clear the key once it receives all of
them. The first proxy will also periodically clean up the oldest idempotency ids, based on a policy determined by two knobs. One knob will control the minimum lifetime of an idempotency id (i.e. don't delete anything younger than 1 day), and the other will control the target byte size of the idempotency keys (e.g. keep 100 MB of idempotency keys around).
id. The commit proxy that committed a batch is responsible for cleaning all
idempotency kv pairs from that batch, so clients must tell that specific proxy
that they're done with the id. The first proxy will also periodically clean up
the oldest idempotency ids, based on a policy determined by two knobs. One knob
will control the minimum lifetime of an idempotency id (i.e. don't delete
anything younger than 1 day), and the other will control the target byte size of
the idempotency keys (e.g. keep 100 MB of idempotency keys around).
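
A minimal sketch of that cleanup decision; the knob values and the helper name are assumptions for illustration only:

```
#include <cstdint>

// Illustrative knob values.
constexpr int64_t MIN_ID_LIFETIME_SECONDS = 24 * 60 * 60;       // don't delete anything younger than 1 day
constexpr int64_t TARGET_IDEMPOTENCY_BYTES = 100 * 1024 * 1024; // keep roughly 100 MB of idempotency keys

// Returns true when the oldest idempotency ids may be reclaimed: they must be
// older than the minimum lifetime, and reclamation only happens while the
// idempotency keyspace is above its target byte size.
bool mayCleanOldest(int64_t ageSeconds, int64_t totalIdempotencyBytes) {
    if (ageSeconds < MIN_ID_LIFETIME_SECONDS)
        return false;
    return totalIdempotencyBytes > TARGET_IDEMPOTENCY_BYTES;
}
```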
# Commit protocol

View File

@ -49,7 +49,7 @@ master_doc = 'index'
# General information about the project.
project = u'FoundationDB'
copyright = u'2013-2021 Apple, Inc and the FoundationDB project authors'
copyright = u'2013-2022 Apple, Inc and the FoundationDB project authors'
# Load the version information from 'versions.target'
import xml.etree.ElementTree as ET

View File

@ -142,6 +142,8 @@ Here is a complete list of valid parameters:
*multipart_min_part_size* (or *minps*) - Min part size for multipart uploads.
*enable_read_cache* (or *erc*) - Whether to enable read block cache.
*read_block_size* (or *rbs*) - Block size in bytes to be used for reads.
*read_ahead_blocks* (or *rab*) - Number of blocks to read ahead of requested offset.

View File

@ -2,6 +2,12 @@
Release Notes
#############
6.3.25
======
* Fixed a transaction log data corruption bug. `(PR #8558) <https://github.com/apple/foundationdb/pull/8558>`_
* Fixed a special keyspace ``SpecialKeyRangeAsyncImpl::getRange`` bug. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_
* Fixed a special keyspace ``ConflictingKeysImpl::getRange`` bug. `(PR #7724) <https://github.com/apple/foundationdb/pull/7724>`_
6.3.24
======
* Fixed a bug where get key location can overload proxies. `(PR #6453) <https://github.com/apple/foundationdb/pull/6453>`_

View File

@ -2,6 +2,25 @@
Release Notes
#############
7.1.25
======
* Same as 7.1.24 release with AVX enabled.
7.1.24
======
* Released with AVX disabled.
* Fixed a transaction log data corruption bug. `(PR #8525) <https://github.com/apple/foundationdb/pull/8525>`_, `(PR #8562) <https://github.com/apple/foundationdb/pull/8562>`_, and `(PR #8647) <https://github.com/apple/foundationdb/pull/8647>`_
* Fixed a rare data race in transaction logs when PEEK_BATCHING_EMPTY_MSG is enabled. `(PR #8660) <https://github.com/apple/foundationdb/pull/8660>`_
* Fixed a heap-use-after-free bug in cluster controller. `(PR #8683) <https://github.com/apple/foundationdb/pull/8683>`_
* Changed consistency check to report all corruptions. `(PR #8571) <https://github.com/apple/foundationdb/pull/8571>`_
* Fixed a rare storage server crashing bug after recovery. `(PR #8468) <https://github.com/apple/foundationdb/pull/8468>`_
* Added client knob UNLINKONLOAD_FDBCLIB to control deletion of external client libraries. `(PR #8434) <https://github.com/apple/foundationdb/pull/8434>`_
* Updated the default peer latency degradation percentile to 0.5. `(PR #8370) <https://github.com/apple/foundationdb/pull/8370>`_
* Made exclusion less pessimistic when warning about low space usage. `(PR #8347) <https://github.com/apple/foundationdb/pull/8347>`_
* Added storage server readrange and update latency metrics. `(PR #8353) <https://github.com/apple/foundationdb/pull/8353>`_
* Increased the default PEER_DEGRADATION_CONNECTION_FAILURE_COUNT value to 5s. `(PR #8336) <https://github.com/apple/foundationdb/pull/8336>`_
* Increased RocksDB block cache size. `(PR #8274) <https://github.com/apple/foundationdb/pull/8274>`_
7.1.23
======
* Same as 7.1.22 release with AVX enabled.

View File

@ -132,6 +132,7 @@ enum {
OPT_DELETE_DATA,
OPT_MIN_CLEANUP_SECONDS,
OPT_USE_PARTITIONED_LOG,
OPT_ENCRYPT_FILES,
// Backup and Restore constants
OPT_PROXY,
@ -275,6 +276,7 @@ CSimpleOpt::SOption g_rgBackupStartOptions[] = {
{ OPT_BLOB_CREDENTIALS, "--blob-credentials", SO_REQ_SEP },
{ OPT_INCREMENTALONLY, "--incremental", SO_NONE },
{ OPT_ENCRYPTION_KEY_FILE, "--encryption-key-file", SO_REQ_SEP },
{ OPT_ENCRYPT_FILES, "--encrypt-files", SO_REQ_SEP },
TLS_OPTION_FLAGS,
SO_END_OF_OPTIONS
};
@ -1112,6 +1114,11 @@ static void printBackupUsage(bool devhelp) {
"and ignore the range files.\n");
printf(" --encryption-key-file"
" The AES-128-GCM key in the provided file is used for encrypting backup files.\n");
printf(" --encrypt-files 0/1"
" If passed, this argument will allow the user to override the database encryption state to "
"either enable (1) or disable (0) encryption at rest with snapshot backups. This option refers to block "
"level encryption of snapshot backups while --encryption-key-file (above) refers to file level encryption. "
"Generally, these two options should not be used together.\n");
printf(TLS_HELP);
printf(" -w, --wait Wait for the backup to complete (allowed with `start' and `discontinue').\n");
printf(" -z, --no-stop-when-done\n"
@ -2365,6 +2372,7 @@ ACTOR Future<Void> runRestore(Database db,
KeyRef(addPrefix),
KeyRef(removePrefix),
LockDB::True,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -3384,8 +3392,8 @@ int main(int argc, char* argv[]) {
bool dryRun = false;
bool restoreSystemKeys = false;
bool restoreUserKeys = false;
// TODO (Nim): Set this value when we add optional encrypt_files CLI argument to backup agent start
bool encryptionEnabled = true;
bool encryptSnapshotFilesPresent = false;
std::string traceDir = "";
std::string traceFormat = "";
std::string traceLogGroup;
@ -3559,6 +3567,25 @@ int main(int argc, char* argv[]) {
case OPT_BASEURL:
baseUrl = args->OptionArg();
break;
case OPT_ENCRYPT_FILES: {
const char* a = args->OptionArg();
int encryptFiles;
if (!sscanf(a, "%d", &encryptFiles)) {
fprintf(stderr, "ERROR: Could not parse encrypt-files `%s'\n", a);
return FDB_EXIT_ERROR;
}
if (encryptFiles != 0 && encryptFiles != 1) {
fprintf(stderr, "ERROR: encrypt-files must be either 0 or 1\n");
return FDB_EXIT_ERROR;
}
encryptSnapshotFilesPresent = true;
if (encryptFiles == 0) {
encryptionEnabled = false;
} else {
encryptionEnabled = true;
}
break;
}
case OPT_RESTORE_CLUSTERFILE_DEST:
restoreClusterFileDest = args->OptionArg();
break;
@ -3792,6 +3819,10 @@ int main(int argc, char* argv[]) {
}
}
if (encryptionKeyFile.present() && encryptSnapshotFilesPresent) {
fprintf(stderr, "WARNING: Use of --encrypt-files and --encryption-key-file together is discouraged\n");
}
// Process the extra arguments
for (int argLoop = 0; argLoop < args->FileCount(); argLoop++) {
switch (programExe) {

View File

@ -43,9 +43,9 @@ Optional<LimitType> parseLimitType(StringRef token) {
}
}
Optional<double> parseLimitValue(StringRef token) {
Optional<int64_t> parseLimitValue(StringRef token) {
try {
return std::stod(token.toString());
return std::stol(token.toString());
} catch (...) {
return {};
}
@ -63,9 +63,9 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
} else {
auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
if (limitType == LimitType::TOTAL) {
fmt::print("{}\n", quota.totalQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
fmt::print("{}\n", quota.totalQuota);
} else if (limitType == LimitType::RESERVED) {
fmt::print("{}\n", quota.reservedQuota * CLIENT_KNOBS->READ_COST_BYTE_FACTOR);
fmt::print("{}\n", quota.reservedQuota);
}
}
return Void();
@ -75,7 +75,7 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
}
}
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, double value) {
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, int64_t value) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -89,9 +89,13 @@ ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
// Internally, costs are stored in terms of pages, but in the API,
// costs are specified in terms of bytes
if (limitType == LimitType::TOTAL) {
quota.totalQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
// Round up to nearest page size
quota.totalQuota =
((value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
} else if (limitType == LimitType::RESERVED) {
quota.reservedQuota = (value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1;
// Round up to nearest page size
quota.reservedQuota =
((value - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();

View File

@ -175,11 +175,13 @@ Future<Reference<IAsyncFile>> BackupContainerS3BlobStore::readFile(const std::st
if (usesEncryption()) {
f = makeReference<AsyncFileEncrypted>(f, AsyncFileEncrypted::Mode::READ_ONLY);
}
f = makeReference<AsyncFileReadAheadCache>(f,
m_bstore->knobs.read_block_size,
m_bstore->knobs.read_ahead_blocks,
m_bstore->knobs.concurrent_reads_per_file,
m_bstore->knobs.read_cache_blocks_per_file);
if (m_bstore->knobs.enable_read_cache) {
f = makeReference<AsyncFileReadAheadCache>(f,
m_bstore->knobs.read_block_size,
m_bstore->knobs.read_ahead_blocks,
m_bstore->knobs.concurrent_reads_per_file,
m_bstore->knobs.read_cache_blocks_per_file);
}
return f;
}

View File

@ -76,6 +76,10 @@ BlobCipherMetrics::BlobCipherMetrics()
UID(),
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE),
getBlobMetadataLatency("GetBlobMetadataLatency",
UID(),
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE),
counterSets({ CounterSet(cc, "TLog"),
CounterSet(cc, "KVMemory"),
CounterSet(cc, "KVRedwood"),

View File

@ -232,10 +232,10 @@ void validateEncryptionHeaderDetails(const BlobGranuleFileEncryptionKeys& eKeys,
.detail("ExpectedHeaderSalt", header.cipherHeaderDetails.salt);
throw encrypt_header_metadata_mismatch();
}
// Validate encryption header 'cipherHeader' details sanity
if (!(header.cipherHeaderDetails.baseCipherId == eKeys.headerCipherKey->getBaseCipherId() &&
header.cipherHeaderDetails.encryptDomainId == eKeys.headerCipherKey->getDomainId() &&
header.cipherHeaderDetails.salt == eKeys.headerCipherKey->getSalt())) {
// Validate encryption header 'cipherText' details sanity
if (!(header.cipherTextDetails.baseCipherId == eKeys.textCipherKey->getBaseCipherId() &&
header.cipherTextDetails.encryptDomainId == eKeys.textCipherKey->getDomainId() &&
header.cipherTextDetails.salt == eKeys.textCipherKey->getSalt())) {
TraceEvent(SevError, "EncryptionHeader_CipherTextMismatch")
.detail("TextDomainId", eKeys.textCipherKey->getDomainId())
.detail("ExpectedTextDomainId", header.cipherTextDetails.encryptDomainId)
@ -650,12 +650,12 @@ struct IndexedBlobGranuleFile {
IndexBlobGranuleFileChunkRef chunkRef =
IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena);
ChildType child;
ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), IncludeVersion());
dataReader.deserialize(FileIdentifierFor<ChildType>::value, child, childArena);
// TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused?
return Standalone<ChildType>(child, childArena);
BinaryReader br(chunkRef.chunkBytes.get(), IncludeVersion());
Standalone<ChildType> child;
br >> child;
return child;
}
template <class Ar>
@ -751,7 +751,7 @@ Value serializeChunkedSnapshot(const Standalone<StringRef>& fileNameRef,
if (currentChunkBytesEstimate >= targetChunkBytes || i == snapshot.size() - 1) {
Value serialized =
ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
BinaryWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
Value chunkBytes =
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
chunks.push_back(chunkBytes);
@ -1020,7 +1020,7 @@ Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef,
if (currentChunkBytesEstimate >= chunkSize || i == boundaries.size() - 1) {
Value serialized =
ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
BinaryWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
Value chunkBytes =
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
chunks.push_back(chunkBytes);

View File

@ -0,0 +1,109 @@
/*
* BlobMetadataUtils.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/BlobMetadataUtils.h"
#include "fmt/format.h"
#include "flow/IRandom.h"
#include "flow/flow.h"
#include "fdbclient/Knobs.h"
#include "fdbclient/S3BlobStore.h"
std::string buildPartitionPath(const std::string& url, const std::string& partition) {
ASSERT(!partition.empty());
ASSERT(partition.front() != '/');
ASSERT(partition.back() == '/');
StringRef u(url);
if (u.startsWith("file://"_sr)) {
ASSERT(u.endsWith("/"_sr));
return url + partition;
} else if (u.startsWith("blobstore://"_sr)) {
std::string resource;
std::string lastOpenError;
S3BlobStoreEndpoint::ParametersT backupParams;
std::string urlCopy = url;
Reference<S3BlobStoreEndpoint> bstore =
S3BlobStoreEndpoint::fromString(url, {}, &resource, &lastOpenError, &backupParams);
ASSERT(!resource.empty());
ASSERT(resource.back() != '/');
size_t resourceStart = url.find(resource);
ASSERT(resourceStart != std::string::npos);
return urlCopy.insert(resourceStart + resource.size(), "/" + partition);
} else {
// FIXME: support azure
throw backup_invalid_url();
}
}
// FIXME: make this (more) deterministic outside of simulation for FDBPerfKmsConnector
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName) {
Standalone<BlobMetadataDetailsRef> metadata;
metadata.domainId = domainId;
metadata.arena().dependsOn(domainName.arena());
metadata.domainName = domainName;
// 0 == no partition, 1 == suffix partitioned, 2 == storage location partitioned
int type = deterministicRandom()->randomInt(0, 3);
int partitionCount = (type == 0) ? 0 : deterministicRandom()->randomInt(2, 12);
TraceEvent ev(SevDebug, "SimBlobMetadata");
ev.detail("DomainId", domainId).detail("TypeNum", type).detail("PartitionCount", partitionCount);
if (type == 0) {
// single storage location
std::string partition = std::to_string(domainId) + "/";
metadata.base = StringRef(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("Base", metadata.base);
}
if (type == 1) {
// simulate hash prefixing in s3
metadata.base = StringRef(metadata.arena(), baseUrl);
ev.detail("Base", metadata.base);
for (int i = 0; i < partitionCount; i++) {
metadata.partitions.push_back_deep(metadata.arena(),
deterministicRandom()->randomUniqueID().shortString() + "-" +
std::to_string(domainId) + "/");
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
if (type == 2) {
// simulate separate storage location per partition
for (int i = 0; i < partitionCount; i++) {
std::string partition = std::to_string(domainId) + "_" + std::to_string(i) + "/";
metadata.partitions.push_back_deep(metadata.arena(), buildPartitionPath(baseUrl, partition));
ev.detail("P" + std::to_string(i), metadata.partitions.back());
}
}
// set random refresh + expire time
if (deterministicRandom()->coinflip()) {
metadata.refreshAt = now() + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
metadata.expireAt =
metadata.refreshAt + deterministicRandom()->random01() * CLIENT_KNOBS->BLOB_METADATA_REFRESH_INTERVAL;
} else {
metadata.refreshAt = std::numeric_limits<double>::max();
metadata.expireAt = metadata.refreshAt;
}
return metadata;
}

View File

@ -61,7 +61,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( WRONG_SHARD_SERVER_DELAY, .01 ); if( randomize && BUGGIFY ) WRONG_SHARD_SERVER_DELAY = deterministicRandom()->random01(); // FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY; // SOMEDAY: This delay can limit performance of retrieving data when the cache is mostly wrong (e.g. dumping the database after a test)
init( FUTURE_VERSION_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) FUTURE_VERSION_RETRY_DELAY = deterministicRandom()->random01();// FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY;
init( GRV_ERROR_RETRY_DELAY, 5.0 ); if( randomize && BUGGIFY ) GRV_ERROR_RETRY_DELAY = 0.01 + 5 * deterministicRandom()->random01();
init( UNKNOWN_TENANT_RETRY_DELAY, 0.0 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = deterministicRandom()->random01();
init( UNKNOWN_TENANT_RETRY_DELAY, .01 ); if( randomize && BUGGIFY ) UNKNOWN_TENANT_RETRY_DELAY = 0.01 + deterministicRandom()->random01();
init( REPLY_BYTE_LIMIT, 80000 );
init( DEFAULT_BACKOFF, .01 ); if( randomize && BUGGIFY ) DEFAULT_BACKOFF = deterministicRandom()->random01();
init( DEFAULT_MAX_BACKOFF, 1.0 );
@ -198,7 +198,6 @@ void ClientKnobs::initialize(Randomize randomize) {
init( DEFAULT_AUTO_LOGS, 3 );
init( DEFAULT_COMMIT_GRV_PROXIES_RATIO, 3 );
init( DEFAULT_MAX_GRV_PROXIES, 4 );
init( DELETE_NATIVE_LIB_AFTER_LOADING, true ); // if false, don't delete libfdb_c in tmp directory on client connect.
init( GLOBAL_CONFIG_REFRESH_BACKOFF, 0.5 );
init( GLOBAL_CONFIG_REFRESH_MAX_BACKOFF, 60.0 );
@ -220,6 +219,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_CONCURRENT_WRITES_PER_FILE, 5 );
init( BLOBSTORE_CONCURRENT_READS_PER_FILE, 3 );
init( BLOBSTORE_ENABLE_READ_CACHE, true );
init( BLOBSTORE_READ_BLOCK_SIZE, 1024 * 1024 );
init( BLOBSTORE_READ_AHEAD_BLOCKS, 0 );
init( BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2 );
@ -272,7 +272,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( TAG_THROTTLE_EXPIRATION_INTERVAL, 60.0 ); if( randomize && BUGGIFY ) TAG_THROTTLE_EXPIRATION_INTERVAL = 1.0;
init( WRITE_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) WRITE_COST_BYTE_FACTOR = 4096;
init( READ_COST_BYTE_FACTOR, 16384 ); if( randomize && BUGGIFY ) READ_COST_BYTE_FACTOR = 4096;
init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
// busyness reporting
init( BUSYNESS_SPIKE_START_THRESHOLD, 0.100 );
@ -281,6 +281,7 @@ void ClientKnobs::initialize(Randomize randomize) {
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
init( BG_TOO_MANY_GRANULES, 10000 );
init( BLOB_METADATA_REFRESH_INTERVAL, 3600 ); if ( randomize && BUGGIFY ) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 );
init( CHANGE_QUORUM_BAD_STATE_RETRY_DELAY, 2.0 );

View File

@ -658,7 +658,7 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
parse((&type), value);
blobGranulesEnabled = (type != 0);
} else if (ck == "encryption_at_rest_mode"_sr) {
encryptionAtRestMode = EncryptionAtRestMode::fromValue(value);
encryptionAtRestMode = EncryptionAtRestMode::fromValueRef(Optional<ValueRef>(value));
} else {
return false;
}

View File

@ -167,6 +167,7 @@ public:
KeyBackedProperty<Key> removePrefix() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> onlyApplyMutationLogs() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> inconsistentSnapshotOnly() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<bool> unlockDBAfterRestore() { return configSpace.pack(__FUNCTION__sr); }
// XXX: Remove restoreRange() once it is safe to remove. It has been changed to restoreRanges
KeyBackedProperty<KeyRange> restoreRange() { return configSpace.pack(__FUNCTION__sr); }
KeyBackedProperty<std::vector<KeyRange>> restoreRanges() { return configSpace.pack(__FUNCTION__sr); }
@ -591,12 +592,11 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<StringRef> decryptImpl(Database cx,
StringRef headerS,
BlobCipherEncryptHeader header,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP));
ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid());
validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header);
@ -606,7 +606,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
static Future<StringRef> decrypt(Database cx,
StringRef headerS,
BlobCipherEncryptHeader headerS,
const uint8_t* dataP,
int64_t dataLen,
Arena* arena) {
@ -651,7 +651,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
}
ACTOR static Future<Void> updateEncryptionKeysCtx(EncryptedRangeFileWriter* self, KeyRef key) {
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self));
state std::pair<int64_t, TenantName> curTenantInfo = wait(getEncryptionDomainDetails(key, self->tenantCache));
state Reference<AsyncVar<ClientDBInfo> const> dbInfo = self->cx->clientInfo;
// Get text and header cipher key
@ -693,12 +693,13 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
static bool isSystemKey(KeyRef key) { return key.size() && key[0] == systemKeys.begin[0]; }
ACTOR static Future<std::pair<int64_t, TenantName>>
getEncryptionDomainDetailsImpl(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache, bool useTenantCache) {
ACTOR static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetailsImpl(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
if (isSystemKey(key)) {
return std::make_pair(SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, FDB_SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_NAME);
}
if (key.size() < TENANT_PREFIX_SIZE || !useTenantCache) {
if (key.size() < TENANT_PREFIX_SIZE) {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
KeyRef tenantPrefix = KeyRef(key.begin(), TENANT_PREFIX_SIZE);
@ -710,21 +711,10 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
return std::make_pair(FDB_DEFAULT_ENCRYPT_DOMAIN_ID, FDB_DEFAULT_ENCRYPT_DOMAIN_NAME);
}
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(KeyRef key,
EncryptedRangeFileWriter* self) {
// If tenants are disabled on a cluster then don't use the TenantEntryCache as it will result in a lot of
// unnecessary cache misses. For a cluster configured in TenantMode::Optional, the backup performance may
// degrade if most of the mutations belong to an invalid tenant
TenantMode mode = self->cx->clientInfo->get().tenantMode;
bool useTenantCache = mode != TenantMode::DISABLED;
if (g_network->isSimulated() && mode == TenantMode::OPTIONAL_TENANT) {
// TODO: Currently simulation tests run with optional tenant mode but most data does not belong to any
// tenant. This results in many timeouts so disable using the tenant cache until optional tenant mode
// support with backups is more performant
useTenantCache = false;
}
CODE_PROBE(useTenantCache, "using tenant cache");
return getEncryptionDomainDetailsImpl(key, self->tenantCache, useTenantCache);
static Future<std::pair<int64_t, TenantName>> getEncryptionDomainDetails(
KeyRef key,
Reference<TenantEntryCache<Void>> tenantCache) {
return getEncryptionDomainDetailsImpl(key, tenantCache);
}
// Handles the first block and internal blocks. Ends current block if needed.
@ -816,6 +806,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
curKeyTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
endKey = StringRef(k.begin(), TENANT_PREFIX_SIZE);
}
state ValueRef newValue = StringRef();
self->lastKey = k;
self->lastValue = v;
@ -834,9 +825,9 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
if (self->lastKey.size() == 0 || k.size() == 0) {
return false;
}
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self));
state std::pair<int64_t, TenantName> prevKeyTenantInfo = wait(getEncryptionDomainDetails(self->lastKey, self));
// crossing tenant boundaries so finish the current block using only the tenant prefix of the new key
state std::pair<int64_t, TenantName> curKeyTenantInfo = wait(getEncryptionDomainDetails(k, self->tenantCache));
state std::pair<int64_t, TenantName> prevKeyTenantInfo =
wait(getEncryptionDomainDetails(self->lastKey, self->tenantCache));
if (curKeyTenantInfo.first != prevKeyTenantInfo.first) {
CODE_PROBE(true, "crossed tenant boundaries");
wait(handleTenantBondary(self, k, v, writeValue, curKeyTenantInfo));
@ -1040,11 +1031,18 @@ private:
Key lastValue;
};
void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>* results) {
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<Reference<TenantEntryCache<Void>>> tenantCache,
Optional<BlobCipherEncryptHeader> encryptHeader) {
// Read begin key, if this fails then block was invalid.
uint32_t kLen = reader->consumeNetworkUInt32();
const uint8_t* k = reader->consume(kLen);
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
state KeyRef prevKey = KeyRef(k, kLen);
state bool done = false;
state Optional<std::pair<int64_t, TenantName>> prevTenantInfo;
// Read kv pairs and end key
while (1) {
@ -1052,6 +1050,35 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
kLen = reader->consumeNetworkUInt32();
k = reader->consume(kLen);
// make sure that all keys in a block belong to exactly one tenant,
// unless it's the last key, in which case it can be a truncated (different) tenant prefix
if (encryptedBlock && g_network && g_network->isSimulated()) {
ASSERT(tenantCache.present());
ASSERT(encryptHeader.present());
state KeyRef curKey = KeyRef(k, kLen);
if (!prevTenantInfo.present()) {
std::pair<int64_t, TenantName> tenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, tenantCache.get()));
prevTenantInfo = tenantInfo;
}
std::pair<int64_t, TenantName> curTenantInfo =
wait(EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, tenantCache.get()));
if (!curKey.empty() && !prevKey.empty() && prevTenantInfo.get().first != curTenantInfo.first) {
ASSERT(!done);
if (curTenantInfo.first != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID &&
curTenantInfo.first != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
ASSERT(curKey.size() == TENANT_PREFIX_SIZE);
}
done = true;
}
// make sure that all keys (except possibly the last key) in a block are encrypted using the correct key
if (!prevKey.empty()) {
ASSERT(prevTenantInfo.get().first == encryptHeader.get().cipherTextDetails.encryptDomainId);
}
prevKey = curKey;
prevTenantInfo = curTenantInfo;
}
// If eof reached or first value len byte is 0xFF then a valid block end was reached.
if (reader->eof() || *reader->rptr == 0xFF) {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef()));
@ -1072,6 +1099,8 @@ void decodeKVPairs(StringRefReader* reader, Standalone<VectorRef<KeyValueRef>>*
for (auto b : reader->remainder())
if (b != 0xFF)
throw restore_corrupted_data_padding();
return Void();
}
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
@ -1094,7 +1123,11 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
decodeKVPairs(&reader, &results);
wait(decodeKVPairs(&reader,
&results,
false,
Optional<Reference<TenantEntryCache<Void>>>(),
Optional<BlobCipherEncryptHeader>()));
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
@ -1108,7 +1141,8 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
// read encryption header
const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
StringRef header = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize;
// calculate the total bytes read up to (and including) the header
int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize;
@ -1117,7 +1151,12 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
decodeKVPairs(&reader, &results);
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
if (g_network && g_simulator->isSimulated()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx.get(), TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
wait(decodeKVPairs(&reader, &results, true, tenantCache, header));
} else {
throw restore_unsupported_file_version();
}
@ -1711,7 +1750,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
state bool done = false;
state int64_t nrKeys = 0;
state bool encryptionEnabled = false;
state Optional<bool> encryptionEnabled;
loop {
state RangeResultWithVersion values;
@ -1777,7 +1816,7 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(taskBucket->keepRunning(tr, task) &&
storeOrThrow(snapshotBeginVersion, backup.snapshotBeginVersion().get(tr)) &&
storeOrThrow(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(encryptionEnabled, backup.enableSnapshotBackupEncryption().get(tr)) &&
store(snapshotRangeFileCount, backup.snapshotRangeFileCount().getD(tr)));
break;
@ -1790,9 +1829,10 @@ struct BackupRangeTaskFunc : BackupTaskFuncBase {
wait(bc->writeRangeFile(snapshotBeginVersion, snapshotRangeFileCount, outVersion, blockSize));
outFile = f;
encryptionEnabled = encryptionEnabled && cx->clientInfo->get().isEncryptionEnabled;
const bool encrypted =
encryptionEnabled.present() && encryptionEnabled.get() && cx->clientInfo->get().isEncryptionEnabled;
// Initialize range file writer and write begin key
if (encryptionEnabled) {
if (encrypted) {
CODE_PROBE(true, "using encrypted snapshot file writer");
if (!tenantCache.isValid()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
@ -3398,6 +3438,8 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
state RestoreConfig restore(task);
restore.stateEnum().set(tr, ERestoreState::COMPLETED);
state bool unlockDB = wait(restore.unlockDBAfterRestore().getD(tr, Snapshot::False, true));
tr->atomicOp(metadataVersionKey, metadataVersionRequiredValue, MutationRef::SetVersionstampedValue);
// Clear the file map now since it could be huge.
restore.fileSet().clear(tr);
@ -3413,7 +3455,9 @@ struct RestoreCompleteTaskFunc : RestoreTaskFuncBase {
restore.clearApplyMutationsKeys(tr);
wait(taskBucket->finish(tr, task));
wait(unlockDatabase(tr, restore.getUid()));
if (unlockDB) {
wait(unlockDatabase(tr, restore.getUid()));
}
return Void();
}
@ -5172,6 +5216,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5245,6 +5290,7 @@ public:
restore.onlyApplyMutationLogs().set(tr, onlyApplyMutationLogs);
restore.inconsistentSnapshotOnly().set(tr, inconsistentSnapshotOnly);
restore.beginVersion().set(tr, beginVersion);
restore.unlockDBAfterRestore().set(tr, unlockDB);
if (BUGGIFY && restoreRanges.size() == 1) {
restore.restoreRange().set(tr, restoreRanges[0]);
} else {
@ -5836,6 +5882,7 @@ public:
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -5892,6 +5939,7 @@ public:
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6017,7 +6065,7 @@ public:
}
}
Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
state Reference<IBackupContainer> bc = wait(backupConfig.backupContainer().getOrThrow(cx.getReference()));
if (fastRestore) {
TraceEvent("AtomicParallelRestoreStartRestore").log();
@ -6043,24 +6091,80 @@ public:
return -1;
} else {
TraceEvent("AS_StartRestore").log();
Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
ranges,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
state Standalone<VectorRef<KeyRangeRef>> restoreRange;
state Standalone<VectorRef<KeyRangeRef>> systemRestoreRange;
bool encryptionEnabled = cx->clientInfo->get().isEncryptionEnabled;
for (auto r : ranges) {
if (!encryptionEnabled || !r.intersects(getSystemBackupRanges())) {
restoreRange.push_back_deep(restoreRange.arena(), r);
} else {
KeyRangeRef normalKeyRange = r & normalKeys;
KeyRangeRef systemKeyRange = r & systemKeys;
if (!normalKeyRange.empty()) {
restoreRange.push_back_deep(restoreRange.arena(), normalKeyRange);
}
if (!systemKeyRange.empty()) {
systemRestoreRange.push_back_deep(systemRestoreRange.arena(), systemKeyRange);
}
}
}
if (!systemRestoreRange.empty()) {
// restore system keys
wait(success(restore(backupAgent,
cx,
cx,
"system_restore"_sr,
KeyRef(bc->getURL()),
bc->getProxy(),
systemRestoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::False,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid)));
state Reference<ReadYourWritesTransaction> rywTransaction =
Reference<ReadYourWritesTransaction>(new ReadYourWritesTransaction(cx));
// clear old restore config associated with system keys
loop {
try {
rywTransaction->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
rywTransaction->setOption(FDBTransactionOptions::LOCK_AWARE);
state RestoreConfig oldRestore(randomUid);
oldRestore.clear(rywTransaction);
wait(rywTransaction->commit());
break;
} catch (Error& e) {
wait(rywTransaction->onError(e));
}
}
}
// restore user data
state Version ver = wait(restore(backupAgent,
cx,
cx,
tagName,
KeyRef(bc->getURL()),
bc->getProxy(),
restoreRange,
WaitForComplete::True,
::invalidVersion,
Verbose::True,
addPrefix,
removePrefix,
LockDB::True,
UnlockDB::True,
OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly::False,
::invalidVersion,
{},
randomUid));
return ver;
}
}
@ -6120,6 +6224,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
Key addPrefix,
Key removePrefix,
LockDB lockDB,
UnlockDB unlockDB,
OnlyApplyMutationLogs onlyApplyMutationLogs,
InconsistentSnapshotOnly inconsistentSnapshotOnly,
Version beginVersion,
@ -6137,6 +6242,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
unlockDB,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,
@ -6178,6 +6284,7 @@ Future<Version> FileBackupAgent::restore(Database cx,
addPrefix,
removePrefix,
lockDB,
UnlockDB::True,
onlyApplyMutationLogs,
inconsistentSnapshotOnly,
beginVersion,

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.cpp
* IdempotencyId.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
@ -18,9 +18,11 @@
* limitations under the License.
*/
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/ReadYourWrites.h"
#include "fdbclient/SystemData.h"
#include "flow/UnitTest.h"
#include "flow/actorcompiler.h" // this has to be the last include
struct IdempotencyIdKVBuilderImpl {
Optional<Version> commitVersion;
@ -40,6 +42,7 @@ void IdempotencyIdKVBuilder::add(const IdempotencyIdRef& id, uint16_t batchIndex
ASSERT((batchIndex >> 8) == impl->batchIndexHighOrderByte.get());
} else {
impl->batchIndexHighOrderByte = batchIndex >> 8;
impl->value << int64_t(now());
}
StringRef s = id.asStringRefUnsafe();
impl->value << uint8_t(s.size());
@ -53,19 +56,17 @@ Optional<KeyValue> IdempotencyIdKVBuilder::buildAndClear() {
return {};
}
BinaryWriter key{ Unversioned() };
key.serializeBytes(idempotencyIdKeys.begin);
key << bigEndian64(impl->commitVersion.get());
key << impl->batchIndexHighOrderByte.get();
Value v = impl->value.toValue();
KeyRef key =
makeIdempotencySingleKeyRange(v.arena(), impl->commitVersion.get(), impl->batchIndexHighOrderByte.get()).begin;
impl->value = BinaryWriter(IncludeVersion());
impl->batchIndexHighOrderByte = Optional<uint8_t>();
Optional<KeyValue> result = KeyValue();
result.get().arena() = v.arena();
result.get().key = key.toValue(result.get().arena());
result.get().key = key;
result.get().value = v;
return result;
}
@ -86,6 +87,8 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
// Even if id is a substring of value, it may still not actually contain it.
BinaryReader reader(kv.value.begin(), kv.value.size(), IncludeVersion());
int64_t timestamp; // ignored
reader >> timestamp;
while (!reader.empty()) {
uint8_t length;
reader >> length;
@ -93,13 +96,9 @@ Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const Idem
uint8_t lowOrderBatchIndex;
reader >> lowOrderBatchIndex;
if (candidate == needle) {
BinaryReader reader(kv.key.begin(), kv.key.size(), Unversioned());
reader.readBytes(idempotencyIdKeys.begin.size());
Version commitVersion;
reader >> commitVersion;
commitVersion = bigEndian64(commitVersion);
uint8_t highOrderBatchIndex;
reader >> highOrderBatchIndex;
decodeIdempotencyKey(kv.key, commitVersion, highOrderBatchIndex);
return CommitResult{ commitVersion,
static_cast<uint16_t>((uint16_t(highOrderBatchIndex) << 8) |
uint16_t(lowOrderBatchIndex)) };
@ -172,4 +171,35 @@ TEST_CASE("/fdbclient/IdempotencyId/serialization") {
ASSERT(t == id);
}
return Void();
}
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex) {
static const auto size =
idempotencyIdKeys.begin.size() + sizeof(version) + sizeof(highOrderBatchIndex) + /*\x00*/ 1;
StringRef second = makeString(size, arena);
auto* dst = mutateString(second);
memcpy(dst, idempotencyIdKeys.begin.begin(), idempotencyIdKeys.begin.size());
dst += idempotencyIdKeys.begin.size();
version = bigEndian64(version);
memcpy(dst, &version, sizeof(version));
dst += sizeof(version);
*dst++ = highOrderBatchIndex;
*dst++ = 0;
ASSERT_EQ(dst - second.begin(), size);
return KeyRangeRef(second.removeSuffix("\x00"_sr), second);
}
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex) {
BinaryReader reader(key, Unversioned());
reader.readBytes(idempotencyIdKeys.begin.size());
reader >> commitVersion;
commitVersion = bigEndian64(commitVersion);
reader >> highOrderBatchIndex;
}
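The pair of helpers above fixes the key layout for idempotency ids: the key is the idempotency-id prefix, followed by the commit version in big-endian form (so keys sort by version), the high-order batch index byte, and a trailing 0x00 that turns the single key into a key range. A minimal standalone sketch of that encode/decode round trip, using std::string instead of Arena/KeyRef and a made-up placeholder prefix, could look like:

// Illustrative sketch only, not part of this commit. "\xff\x02/idmp/" is a
// placeholder; the real prefix is idempotencyIdKeys.begin.
#include <cassert>
#include <cstdint>
#include <string>

static const std::string kPrefix = "\xff\x02/idmp/";

std::string encodeIdempotencyKey(int64_t version, uint8_t highOrderBatchIndex) {
    std::string key = kPrefix;
    for (int shift = 56; shift >= 0; shift -= 8) // big-endian commit version
        key.push_back(char((version >> shift) & 0xff));
    key.push_back(char(highOrderBatchIndex));
    return key; // the single-key range is [key, key + '\x00')
}

void decodeIdempotencyKeySketch(const std::string& key, int64_t& version, uint8_t& highOrderBatchIndex) {
    size_t pos = kPrefix.size();
    version = 0;
    for (int i = 0; i < 8; ++i)
        version = (version << 8) | uint8_t(key[pos++]);
    highOrderBatchIndex = uint8_t(key[pos]);
}

int main() {
    int64_t v = 0;
    uint8_t hi = 0;
    decodeIdempotencyKeySketch(encodeIdempotencyKey(12345, 7), v, hi);
    assert(v == 12345 && hi == 7);
    return 0;
}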

View File

@ -2639,7 +2639,8 @@ TEST_CASE("/ManagementAPI/AutoQuorumChange/checkLocality") {
ProcessClass(ProcessClass::CoordinatorClass, ProcessClass::CommandLineSource),
"",
"",
currentProtocolVersion());
currentProtocolVersion(),
false);
}
workers.push_back(data);

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "flow/Trace.h"
#ifdef ADDRESS_SANITIZER
#include <sanitizer/lsan_interface.h>
#endif
@ -414,6 +415,20 @@ Version DLTransaction::getCommittedVersion() {
return version;
}
ThreadFuture<int64_t> DLTransaction::getTotalCost() {
if (!api->transactionGetTotalCost) {
return unsupported_operation();
}
FdbCApi::FDBFuture* f = api->transactionGetTotalCost(tr);
return toThreadFuture<int64_t>(api, f, [](FdbCApi::FDBFuture* f, FdbCApi* api) {
int64_t size = 0;
FdbCApi::fdb_error_t error = api->futureGetInt64(f, &size);
ASSERT(!error);
return size;
});
}
ThreadFuture<int64_t> DLTransaction::getApproximateSize() {
if (!api->transactionGetApproximateSize) {
return unsupported_operation();
@ -950,6 +965,11 @@ void DLApi::init() {
fdbCPath,
"fdb_transaction_get_committed_version",
headerVersion >= 0);
loadClientFunction(&api->transactionGetTotalCost,
lib,
fdbCPath,
"fdb_transaction_get_total_cost",
headerVersion >= ApiVersion::withGetTotalCost().version());
loadClientFunction(&api->transactionGetApproximateSize,
lib,
fdbCPath,
@ -1486,6 +1506,12 @@ ThreadFuture<SpanContext> MultiVersionTransaction::getSpanContext() {
return SpanContext();
}
ThreadFuture<int64_t> MultiVersionTransaction::getTotalCost() {
auto tr = getTransaction();
auto f = tr.transaction ? tr.transaction->getTotalCost() : makeTimeout<int64_t>();
return abortableFuture(f, tr.onChange);
}
ThreadFuture<int64_t> MultiVersionTransaction::getApproximateSize() {
auto tr = getTransaction();
auto f = tr.transaction ? tr.transaction->getApproximateSize() : makeTimeout<int64_t>();
@ -1863,6 +1889,9 @@ void MultiVersionDatabase::setOption(FDBDatabaseOptions::Option option, Optional
TraceEvent("UnknownDatabaseOption").detail("Option", option);
throw invalid_option();
}
if (itr->first == FDBDatabaseOptions::USE_CONFIG_DATABASE) {
dbState->isConfigDB = true;
}
int defaultFor = itr->second.defaultFor;
if (defaultFor >= 0) {
@ -1969,7 +1998,7 @@ ThreadFuture<ProtocolVersion> MultiVersionDatabase::getServerProtocol(Optional<P
MultiVersionDatabase::DatabaseState::DatabaseState(ClusterConnectionRecord const& connectionRecord,
Reference<IDatabase> versionMonitorDb)
: dbVar(new ThreadSafeAsyncVar<Reference<IDatabase>>(Reference<IDatabase>(nullptr))),
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false) {}
connectionRecord(connectionRecord), versionMonitorDb(versionMonitorDb), closed(false), isConfigDB(false) {}
// Adds a client (local or externally loaded) that can be used to connect to the cluster
void MultiVersionDatabase::DatabaseState::addClient(Reference<ClientInfo> client) {
@ -2167,8 +2196,12 @@ void MultiVersionDatabase::DatabaseState::updateDatabase(Reference<IDatabase> ne
.detail("ConnectionRecord", connectionRecord);
}
}
// Verify the database has the necessary functionality to update the shared
// state. Avoid updating the shared state if the database is a
// configuration database, because a configuration database does not have
// access to typical system keys and does not need to be updated.
if (db.isValid() && dbProtocolVersion.present() &&
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap()) {
MultiVersionApi::api->getApiVersion().hasClusterSharedStateMap() && !isConfigDB) {
Future<std::string> updateResult =
MultiVersionApi::api->updateClusterSharedStateMap(connectionRecord, dbProtocolVersion.get(), db);
sharedStateUpdater = map(errorOr(updateResult), [this](ErrorOr<std::string> result) {
@ -2616,6 +2649,9 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option,
} else if (option == FDBNetworkOptions::TRACE_SHARE_AMONG_CLIENT_THREADS) {
validateOption(value, false, true);
traceShareBaseNameAmongThreads = true;
} else if (option == FDBNetworkOptions::RETAIN_CLIENT_LIBRARY_COPIES) {
validateOption(value, false, true);
retainClientLibCopies = true;
} else {
forwardOption = true;
}
@ -2661,7 +2697,7 @@ void MultiVersionApi::setupNetwork() {
externalClients[filename] = {};
auto libCopies = copyExternalLibraryPerThread(path);
for (int idx = 0; idx < libCopies.size(); ++idx) {
bool unlinkOnLoad = libCopies[idx].second && CLIENT_KNOBS->DELETE_NATIVE_LIB_AFTER_LOADING;
bool unlinkOnLoad = libCopies[idx].second && !retainClientLibCopies;
externalClients[filename].push_back(Reference<ClientInfo>(
new ClientInfo(new DLApi(libCopies[idx].first, unlinkOnLoad /*unlink on load*/),
path,
@ -2780,11 +2816,19 @@ void MultiVersionApi::runNetwork() {
});
}
localClient->api->runNetwork();
try {
localClient->api->runNetwork();
} catch (const Error& e) {
closeTraceFile();
throw e;
}
for (auto h : handles) {
waitThread(h);
}
TraceEvent("MultiVersionRunNetworkTerminating");
closeTraceFile();
}
void MultiVersionApi::stopNetwork() {
@ -3066,8 +3110,8 @@ void MultiVersionApi::loadEnvironmentVariableNetworkOptions() {
MultiVersionApi::MultiVersionApi()
: callbackOnMainThread(true), localClientDisabled(false), networkStartSetup(false), networkSetup(false),
disableBypass(false), bypassMultiClientApi(false), externalClient(false), apiVersion(0), threadCount(0),
tmpDir("/tmp"), traceShareBaseNameAmongThreads(false), envOptionsLoaded(false) {}
disableBypass(false), bypassMultiClientApi(false), externalClient(false), retainClientLibCopies(false),
apiVersion(0), threadCount(0), tmpDir("/tmp"), traceShareBaseNameAmongThreads(false), envOptionsLoaded(false) {}
MultiVersionApi* MultiVersionApi::api = new MultiVersionApi();

View File

@ -3456,6 +3456,8 @@ ACTOR Future<Optional<Value>> getValue(Reference<TransactionState> trState,
}
trState->cx->getValueCompleted->latency = timer_int() - startTime;
trState->cx->getValueCompleted->log();
trState->totalCost +=
getReadOperationCost(key.size() + (reply.value.present() ? reply.value.get().size() : 0));
if (getValueID.present()) {
g_traceBatch.addEvent("GetValueDebug",
@ -4015,6 +4017,7 @@ Future<RangeResultFamily> getExactRange(Reference<TransactionState> trState,
req.version = version;
req.begin = firstGreaterOrEqual(range.begin);
req.end = firstGreaterOrEqual(range.end);
setMatchIndex<GetKeyValuesFamilyRequest>(req, matchIndex);
req.spanContext = span.context;
trState->cx->getLatestCommitVersions(
@ -4284,6 +4287,7 @@ void getRangeFinished(Reference<TransactionState> trState,
RangeResultFamily result) {
int64_t bytes = getRangeResultFamilyBytes(result);
trState->totalCost += getReadOperationCost(bytes);
trState->cx->transactionBytesRead += bytes;
trState->cx->transactionKeysRead += result.size();
@ -4352,6 +4356,7 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
state KeySelector originalEnd = end;
state RangeResultFamily output;
state Span span("NAPI:getRange"_loc, trState->spanContext);
state Optional<UID> getRangeID = Optional<UID>();
if (useTenant && trState->tenant().present()) {
span.addAttribute("tenant"_sr, trState->tenant().get());
}
@ -4436,11 +4441,14 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional<TagSet>();
req.spanContext = span.context;
if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) {
getRangeID = nondeterministicRandom()->randomUniqueID();
g_traceBatch.addAttach(
"TransactionAttachID", trState->readOptions.get().debugID.get().first(), getRangeID.get().first());
}
try {
if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) {
g_traceBatch.addEvent("TransactionDebug",
trState->readOptions.get().debugID.get().first(),
"NativeAPI.getRange.Before");
if (getRangeID.present()) {
g_traceBatch.addEvent("TransactionDebug", getRangeID.get().first(), "NativeAPI.getRange.Before");
/*TraceEvent("TransactionDebugGetRangeInfo", trState->readOptions.debugID.get())
.detail("ReqBeginKey", req.begin.getKey())
.detail("ReqEndKey", req.end.getKey())
@ -4480,9 +4488,9 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
throw;
}
if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) {
if (getRangeID.present()) {
g_traceBatch.addEvent("TransactionDebug",
trState->readOptions.get().debugID.get().first(),
getRangeID.get().first(),
"NativeAPI.getRange.After"); //.detail("SizeOf", rep.data.size());
/*TraceEvent("TransactionDebugGetRangeDone", trState->readOptions.debugID.get())
.detail("ReqBeginKey", req.begin.getKey())
@ -4596,11 +4604,9 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
}
} catch (Error& e) {
if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) {
g_traceBatch.addEvent("TransactionDebug",
trState->readOptions.get().debugID.get().first(),
"NativeAPI.getRange.Error");
TraceEvent("TransactionDebugError", trState->readOptions.get().debugID.get()).error(e);
if (getRangeID.present()) {
g_traceBatch.addEvent("TransactionDebug", getRangeID.get().first(), "NativeAPI.getRange.Error");
TraceEvent("TransactionDebugError", getRangeID.get()).error(e);
}
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
(e.code() == error_code_transaction_too_old && readVersion == latestVersion)) {
@ -5766,6 +5772,7 @@ void Transaction::set(const KeyRef& key, const ValueRef& value, AddConflictRange
auto r = singleKeyRange(key, req.arena);
auto v = ValueRef(req.arena, value);
t.mutations.emplace_back(req.arena, MutationRef::SetValue, r.begin, v);
trState->totalCost += getWriteOperationCost(key.expectedSize() + value.expectedSize());
if (addConflictRange) {
t.write_conflict_ranges.push_back(req.arena, r);
@ -5795,6 +5802,7 @@ void Transaction::atomicOp(const KeyRef& key,
auto v = ValueRef(req.arena, operand);
t.mutations.emplace_back(req.arena, operationType, r.begin, v);
trState->totalCost += getWriteOperationCost(key.expectedSize());
if (addConflictRange && operationType != MutationRef::SetVersionstampedKey)
t.write_conflict_ranges.push_back(req.arena, r);
@ -5826,7 +5834,10 @@ void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRa
return;
t.mutations.emplace_back(req.arena, MutationRef::ClearRange, r.begin, r.end);
// NOTE: The throttling cost of each clear is assumed to be one page.
// This makes computation fast, but can be inaccurate and may
// underestimate the cost of large clears.
trState->totalCost += CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR;
if (addConflictRange)
t.write_conflict_ranges.push_back(req.arena, r);
}
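The cost accounting added here and in set()/atomicOp() above feeds trState->totalCost, which the getTotalCost() accessors added elsewhere in this commit expose through the client bindings. Writes are charged via getWriteOperationCost() on the mutated bytes, while a clear is assumed to cost exactly one WRITE_COST_BYTE_FACTOR page. A rough standalone sketch of that accumulation, assuming (as an approximation, not the exact knob-driven formula) that the cost rounds bytes up to whole pages, might look like:

// Hedged sketch of the write-cost accumulation pattern; the real
// getWriteOperationCost() and WRITE_COST_BYTE_FACTOR come from the client
// knobs and the exact rounding may differ.
#include <cstdint>
#include <iostream>

constexpr int64_t kWriteCostByteFactor = 16384; // assumed page size

int64_t writeOperationCost(int64_t bytes) {
    int64_t pages = (bytes + kWriteCostByteFactor - 1) / kWriteCostByteFactor;
    return (pages < 1 ? 1 : pages) * kWriteCostByteFactor; // at least one page
}

int main() {
    int64_t totalCost = 0;
    totalCost += writeOperationCost(100 + 2000); // set(key, value)
    totalCost += writeOperationCost(100);        // atomic op on key
    totalCost += kWriteCostByteFactor;           // clear: assumed one page
    std::cout << totalCost << std::endl;         // three pages' worth of cost
    return 0;
}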
@ -6142,6 +6153,7 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
IdempotencyIdRef idempotencyId) {
state Transaction tr(trState->cx);
state int retries = 0;
state Version expiredVersion;
state Span span("NAPI:determineCommitStatus"_loc, trState->spanContext);
tr.span.setParent(span.context);
loop {
@ -6151,11 +6163,19 @@ ACTOR static Future<Optional<CommitResult>> determineCommitStatus(Reference<Tran
tr.trState->authToken = trState->authToken;
tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE);
KeyBackedObjectProperty<IdempotencyIdsExpiredVersion, _Unversioned> expiredKey(idempotencyIdsExpiredVersion,
Unversioned());
IdempotencyIdsExpiredVersion expiredVal = wait(expiredKey.getD(&tr));
expiredVersion = expiredVal.expired;
if (expiredVersion >= minPossibleCommitVersion) {
throw commit_unknown_result_fatal();
}
Version rv = wait(tr.getReadVersion());
TraceEvent("DetermineCommitStatusAttempt")
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe())
.detail("Retries", retries)
.detail("ReadVersion", rv)
.detail("ExpiredVersion", expiredVersion)
.detail("MinPossibleCommitVersion", minPossibleCommitVersion)
.detail("MaxPossibleCommitVersion", maxPossibleCommitVersion);
KeyRange possibleRange =
@ -6230,14 +6250,14 @@ ACTOR Future<Optional<ClientTrCommitCostEstimation>> estimateCommitCosts(Referen
state int i = 0;
for (; i < transaction->mutations.size(); ++i) {
auto* it = &transaction->mutations[i];
auto const& mutation = transaction->mutations[i];
if (it->type == MutationRef::Type::SetValue || it->isAtomicOp()) {
if (mutation.type == MutationRef::Type::SetValue || mutation.isAtomicOp()) {
trCommitCosts.opsCount++;
trCommitCosts.writeCosts += getWriteOperationCost(it->expectedSize());
} else if (it->type == MutationRef::Type::ClearRange) {
trCommitCosts.writeCosts += getWriteOperationCost(mutation.expectedSize());
} else if (mutation.type == MutationRef::Type::ClearRange) {
trCommitCosts.opsCount++;
keyRange = KeyRangeRef(it->param1, it->param2);
keyRange = KeyRangeRef(mutation.param1, mutation.param2);
if (trState->options.expensiveClearCostEstimation) {
StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY, trState));
trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes));
@ -6366,8 +6386,11 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
}
if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) {
wait(store(req.transaction.read_snapshot, readVersion) &&
store(req.commitCostEstimation, estimateCommitCosts(trState, &req.transaction)));
state Future<Optional<ClientTrCommitCostEstimation>> commitCostFuture =
estimateCommitCosts(trState, &req.transaction);
// We need to wait for the read version first so that we can be notified if the database is locked
wait(store(req.transaction.read_snapshot, readVersion));
wait(store(req.commitCostEstimation, commitCostFuture));
} else {
wait(store(req.transaction.read_snapshot, readVersion));
}
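This reordering starts the commit cost estimate immediately but waits on the read version before waiting on the estimate, so that a locked database is reported by the read-version wait rather than being masked by a slow cost estimation. A toy illustration of the same sequencing with std::future (the real code uses flow Futures; the values below are placeholders) is:

// Sketch of the sequencing only; the point is that errors from the read
// version surface before we block on the cost estimate.
#include <future>
#include <iostream>

int main() {
    // Kick off the potentially slow cost estimate right away...
    std::future<long> costEstimate = std::async(std::launch::async, [] { return 42L; });
    std::future<long> readVersion = std::async(std::launch::async, [] { return 1000L; });
    // ...but block on the read version first, so problems such as a locked
    // database surface here before we spend time on the estimate.
    long rv = readVersion.get();
    long cost = costEstimate.get();
    std::cout << rv << " " << cost << std::endl;
    return 0;
}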
@ -6399,6 +6422,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
req.debugID = commitID;
state Future<CommitID> reply;
// Only gets filled in in the happy path where we don't have to commit on the first proxy or use provisional
// proxies
state int alternativeChosen = -1;
// Only valid if alternativeChosen >= 0
state Reference<CommitProxyInfo> proxiesUsed;
if (trState->options.commitOnFirstProxy) {
if (trState->cx->clientInfo->get().firstCommitProxy.present()) {
reply = throwErrorOr(brokenPromiseToMaybeDelivered(
@ -6409,11 +6438,13 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
: Never();
}
} else {
reply = basicLoadBalance(trState->cx->getCommitProxies(trState->useProvisionalProxies),
proxiesUsed = trState->cx->getCommitProxies(trState->useProvisionalProxies);
reply = basicLoadBalance(proxiesUsed,
&CommitProxyInterface::commit,
req,
TaskPriority::DefaultPromiseEndpoint,
AtMostOnce::True);
AtMostOnce::True,
&alternativeChosen);
}
state double grvTime = now();
choose {
@ -6463,6 +6494,12 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
ci.version,
req,
trState->tenant()));
if (trState->automaticIdempotency && alternativeChosen >= 0) {
// Automatic idempotency means we're responsible for best effort idempotency id clean up
proxiesUsed->getInterface(alternativeChosen)
.expireIdempotencyId.send(ExpireIdempotencyIdRequest{
ci.version, uint8_t(ci.txnBatchId >> 8), trState->getTenantInfo() });
}
return Void();
} else {
// clear the RYW transaction which contains previous conflicting keys
@ -6948,11 +6985,16 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
throw e;
}
tr.idempotencyId = IdempotencyIdRef(tr.arena, IdempotencyIdRef(value.get()));
trState->automaticIdempotency = false;
break;
case FDBTransactionOptions::AUTOMATIC_IDEMPOTENCY:
validateOptionValueNotPresent(value);
tr.idempotencyId = IdempotencyIdRef(
tr.arena, IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
if (!tr.idempotencyId.valid()) {
tr.idempotencyId = IdempotencyIdRef(
tr.arena,
IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned())));
}
trState->automaticIdempotency = true;
break;
default:
@ -7519,12 +7561,11 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
Optional<Reference<TransactionState>> trState);
ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
TenantInfo tenantInfo,
KeyRange keys,
Reference<LocationInfo> locationInfo,
TenantMapEntry tenantEntry,
Optional<Reference<TransactionState>> trState) {
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
try {
WaitMetricsRequest req(tenantInfo, keys, StorageMetrics(), StorageMetrics());
req.min.bytes = 0;
@ -7533,12 +7574,16 @@ ACTOR Future<StorageMetrics> doGetStorageMetrics(Database cx,
locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution));
return m;
} catch (Error& e) {
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
cx->invalidateCache(tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
} else if (e.code() == error_code_unknown_tenant && trState.present() &&
tenantInfo.tenantId != TenantInfo::INVALID_TENANT) {
wait(trState.get()->handleUnknownTenant());
} else {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
cx->invalidateCache(tenantEntry.prefix, keys);
StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys, trState));
return m;
@ -7569,7 +7614,7 @@ ACTOR Future<StorageMetrics> getStorageMetricsLargeKeyRange(Database cx,
partBegin = (i == 0) ? keys.begin : locations[i].range.begin;
partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end;
fx[i] = doGetStorageMetrics(
cx, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
cx, tenantInfo, KeyRangeRef(partBegin, partEnd), locations[i].locations, locations[i].tenantEntry, trState);
}
wait(waitForAll(fx));
for (int i = 0; i < nLocs; i++) {
@ -7724,27 +7769,18 @@ ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo
StorageMetrics min,
StorageMetrics max,
StorageMetrics permittedError) {
try {
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
StorageMetrics x = wait(fx);
return x;
} catch (Error& e) {
TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
Future<StorageMetrics> fx;
if (locations.size() > 1) {
fx = waitStorageMetricsMultipleLocations(tenantInfo, locations, min, max, permittedError);
} else {
WaitMetricsRequest req(tenantInfo, keys, min, max);
fx = loadBalance(locations[0].locations->locations(),
&StorageServerInterface::waitMetrics,
req,
TaskPriority::DataDistribution);
}
return Optional<StorageMetrics>();
StorageMetrics x = wait(fx);
return x;
}
ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
@ -7757,9 +7793,9 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
int expectedShardCount,
Optional<Reference<TransactionState>> trState) {
state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample));
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
loop {
state TenantInfo tenantInfo =
wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin, latestVersion) : TenantInfo());
state std::vector<KeyRangeLocationInfo> locations =
wait(getKeyRangeLocations(cx,
tenantInfo,
@ -7789,13 +7825,25 @@ ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
continue;
}
Optional<StorageMetrics> res =
wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
try {
Optional<StorageMetrics> res =
wait(waitStorageMetricsWithLocation(tenantInfo, keys, locations, min, max, permittedError));
if (res.present()) {
return std::make_pair(res, -1);
}
} catch (Error& e) {
TraceEvent(SevDebug, "WaitStorageMetricsError").error(e);
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
} else if (e.code() == error_code_unknown_tenant && trState.present() &&
tenantInfo.tenantId != TenantInfo::INVALID_TENANT) {
wait(trState.get()->handleUnknownTenant());
} else {
TraceEvent(SevError, "WaitStorageMetricsError").error(e);
throw;
}
}
cx->invalidateCache(locations[0].tenantEntry.prefix, keys);
wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
}
}
@ -7965,6 +8013,21 @@ ACTOR Future<TenantMapEntry> blobGranuleGetTenantEntry(Transaction* self,
return tme;
}
// Tenants are supposed to be unique and therefore can be loaded once.
// There is an assumption that a tenant exists as long as operations are happening against said tenant.
ACTOR Future<TenantMapEntry> blobLoadTenantMapEntry(Database* db, Key rangeStartKey, Optional<TenantName> tenantName) {
state Transaction tr(*db);
loop {
try {
TenantMapEntry tme = wait(blobGranuleGetTenantEntry(&tr, rangeStartKey, tenantName));
return tme;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
Future<Standalone<VectorRef<KeyRef>>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) {
return ::getRangeSplitPoints(
trState, keys, chunkSize, readVersion.isValid() && readVersion.isReady() ? readVersion.get() : latestVersion);
@ -8436,7 +8499,6 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
state Version readVersionOut = invalidVersion;
state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
state int loadSize = (BUGGIFY ? deterministicRandom()->randomInt(1, 20) : 20) * batchSize;
state bool loadedTenantEntry = false;
if (version.present()) {
if (version.get() == latestVersion) {
@ -8456,16 +8518,16 @@ ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
}
}
if (tenantName.present()) {
TenantMapEntry tme = wait(blobLoadTenantMapEntry(&db, range.begin, tenantName));
range = range.withPrefix(tme.prefix);
curRegion = KeyRangeRef(range.begin, range.begin);
}
loop {
if (curRegion.begin >= range.end) {
return readVersionOut;
}
if (tenantName.present() && !loadedTenantEntry) {
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenantName));
loadedTenantEntry = true;
range = range.withPrefix(tenantEntry.prefix);
curRegion = KeyRangeRef(range.begin, range.begin);
}
loop {
try {
wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize)));
@ -9328,7 +9390,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
mismatchEvent.detail("EndKey", request.range.end);
mismatchEvent.detail("CanReadPopped", request.canReadPopped);
mismatchEvent.detail("PopVersion", popVersion);
mismatchEvent.detail("DebugUID", request.debugUID);
mismatchEvent.detail("DebugUID", request.id);
// mismatch info
mismatchEvent.detail("MatchesFound", matchesFound);
@ -9354,7 +9416,7 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
"TSSMismatchChangeFeedStream");
summaryEvent.detail("TSSID", tssData.tssId)
.detail("MismatchId", mismatchUID)
.detail("FeedDebugUID", request.debugUID);
.detail("FeedDebugUID", request.id);
}
}
}
@ -9879,7 +9941,8 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state std::vector<Future<Void>> fetchers(interfs.size());
state std::vector<Future<Void>> onErrors(interfs.size());
state std::vector<MutationAndVersionStream> streams(interfs.size());
@ -9907,10 +9970,11 @@ ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
if (replyBufferSize != -1 && req.replyBufferSize < CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES) {
req.replyBufferSize = CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES;
}
req.debugUID = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.debugUID);
mergeCursorUID =
UID(mergeCursorUID.first() ^ req.debugUID.first(), mergeCursorUID.second() ^ req.debugUID.second());
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
debugUIDs.push_back(req.id);
mergeCursorUID = UID(mergeCursorUID.first() ^ req.id.first(), mergeCursorUID.second() ^ req.id.second());
results->streams.push_back(interfs[i].first.changeFeedStream.getReplyStream(req));
maybeDuplicateTSSChangeFeedStream(req,
@ -10113,7 +10177,8 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
Version* begin,
Version end,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state ChangeFeedStreamRequest req;
state Optional<ChangeFeedTSSValidationData> tssData;
@ -10123,10 +10188,11 @@ ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
req.range = range;
req.canReadPopped = canReadPopped;
req.replyBufferSize = replyBufferSize;
req.debugUID = deterministicRandom()->randomUniqueID();
req.options = readOptions;
req.id = deterministicRandom()->randomUniqueID();
if (DEBUG_CF_CLIENT_TRACE) {
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.debugUID)
TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.id)
.detail("FeedID", rangeID)
.detail("Range", range)
.detail("Begin", *begin)
@ -10166,7 +10232,8 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
bool canReadPopped,
ReadOptions readOptions) {
state Database cx(db);
state Span span("NAPI:GetChangeFeedStream"_loc);
db->usedAnyChangeFeeds = true;
@ -10256,14 +10323,22 @@ ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
}
CODE_PROBE(true, "Change feed merge cursor");
// TODO (jslocum): validate connectionFileChanged behavior
wait(
mergeChangeFeedStream(db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
cx->connectionFileChanged());
wait(mergeChangeFeedStream(
db, interfs, results, rangeID, &begin, end, replyBufferSize, canReadPopped, readOptions) ||
cx->connectionFileChanged());
} else {
CODE_PROBE(true, "Change feed single cursor");
StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
wait(singleChangeFeedStream(
db, interf, range, results, rangeID, &begin, end, replyBufferSize, canReadPopped) ||
wait(singleChangeFeedStream(db,
interf,
range,
results,
rangeID,
&begin,
end,
replyBufferSize,
canReadPopped,
readOptions) ||
cx->connectionFileChanged());
}
} catch (Error& e) {
@ -10330,9 +10405,17 @@ Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> resu
Version end,
KeyRange range,
int replyBufferSize,
bool canReadPopped) {
return getChangeFeedStreamActor(
Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
bool canReadPopped,
ReadOptions readOptions) {
return getChangeFeedStreamActor(Reference<DatabaseContext>::addRef(this),
results,
rangeID,
begin,
end,
range,
replyBufferSize,
canReadPopped,
readOptions);
}
Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
@ -10564,6 +10647,34 @@ Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
}
// BlobGranule API.
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) {
state Standalone<VectorRef<KeyRangeRef>> blobRanges;
state Key beginKey = range.begin;
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
loop {
state RangeResult results =
wait(krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
blobRanges.arena().dependsOn(results.arena());
for (int i = 0; i < results.size() - 1; i++) {
if (results[i].value == blobRangeActive) {
blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
}
if (blobRanges.size() == batchLimit) {
return blobRanges;
}
}
if (!results.more) {
return blobRanges;
}
beginKey = results.back().key;
}
}
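getBlobRanges above is a straightforward paginated scan over the blob range map: fetch up to 2 * batchLimit + 2 boundaries starting at beginKey, keep each [boundary, next boundary) pair whose value marks it active, and resume from the last boundary returned while the result indicates there is more; retries are handled by the callers (purgeBlobGranulesActor and listBlobbifiedRangesActor loop with tr.onError around it) rather than inside the scan. A small self-contained sketch of the pattern over an ordinary sorted map, standing in for krmGetRangesUnaligned and the blobRangeActive marker, might look like:

// Sketch of the paginated boundary scan; std::map stands in for the
// key-range map and "active" stands in for blobRangeActive.
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

using RangeMap = std::map<std::string, std::string>;
using Range = std::pair<std::string, std::string>;

std::vector<Range> activeRanges(const RangeMap& boundaries, std::string begin, const std::string& end, std::size_t batchLimit) {
    std::vector<Range> out;
    while (true) {
        // Fetch one limited batch of boundaries in [begin, end).
        std::vector<Range> batch;
        for (auto it = boundaries.lower_bound(begin); it != boundaries.end() && it->first < end; ++it) {
            batch.push_back(*it);
            if (batch.size() == 2 * batchLimit + 2)
                break;
        }
        bool more = batch.size() == 2 * batchLimit + 2;
        for (std::size_t i = 0; i + 1 < batch.size(); ++i) {
            if (batch[i].second == "active")
                out.emplace_back(batch[i].first, batch[i + 1].first);
            if (out.size() == batchLimit)
                return out;
        }
        if (!more || batch.empty())
            return out;
        begin = batch.back().first; // resume from the last boundary seen
    }
}

int main() {
    RangeMap m{ { "a", "active" }, { "c", "" }, { "d", "active" }, { "f", "" } };
    auto r = activeRanges(m, "a", "z", 10); // yields [a, c) and [d, f)
    return r.size() == 2 ? 0 : 1;
}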
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
KeyRange range,
Version purgeVersion,
@ -10573,7 +10684,6 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
state Transaction tr(cx);
state Key purgeKey;
state KeyRange purgeRange = range;
state bool loadedTenantPrefix = false;
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
if (purgeVersion == latestVersion) {
@ -10593,23 +10703,25 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
throw unsupported_operation();
}
if (tenant.present()) {
TenantMapEntry tme = wait(blobLoadTenantMapEntry(&cx, range.begin, tenant));
purgeRange = purgeRange.withPrefix(tme.prefix);
}
loop {
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
if (tenant.present() && !loadedTenantPrefix) {
TenantMapEntry tenantEntry = wait(blobGranuleGetTenantEntry(&tr, range.begin, tenant));
loadedTenantPrefix = true;
purgeRange = purgeRange.withPrefix(tenantEntry.prefix);
}
// must be aligned to blob range(s)
state Future<Optional<Value>> beginPresent = tr.get(purgeRange.begin.withPrefix(blobRangeKeys.begin));
state Future<Optional<Value>> endPresent = tr.get(purgeRange.end.withPrefix(blobRangeKeys.begin));
wait(success(beginPresent) && success(endPresent));
if (!beginPresent.get().present() || !endPresent.get().present()) {
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, purgeRange.begin), 2);
state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
getBlobRanges(&tr, KeyRangeRef(purgeRange.end, purgeRange.end), 2);
wait(success(blobbifiedBegin) && success(blobbifiedEnd));
if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
(!blobbifiedEnd.get().empty() && blobbifiedEnd.get().back().end > purgeRange.end)) {
TraceEvent("UnalignedPurge")
.detail("Range", range)
.detail("Version", purgeVersion)
@ -10686,46 +10798,17 @@ Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Reference<ReadYourWritesTransaction> tr,
KeyRange range,
int batchLimit) {
state Standalone<VectorRef<KeyRangeRef>> blobRanges;
state Key beginKey = range.begin;
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state RangeResult results = wait(
krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));
blobRanges.arena().dependsOn(results.arena());
for (int i = 0; i < results.size() - 1; i++) {
if (results[i].value == blobRangeActive) {
blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
}
if (blobRanges.size() == batchLimit) {
return blobRanges;
}
}
if (!results.more) {
return blobRanges;
}
beginKey = results.back().key;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
KeyRange range,
bool active,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state bool loadedTenantEntry = false;
if (tenantName.present()) {
TenantMapEntry tme = wait(blobLoadTenantMapEntry(&db, range.begin, tenantName));
range = range.withPrefix(tme.prefix);
}
state Value value = active ? blobRangeActive : blobRangeInactive;
loop {
@ -10733,14 +10816,7 @@ ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
if (tenantName.present() && !loadedTenantEntry) {
TenantMapEntry tenantEntry =
wait(blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName));
loadedTenantEntry = true;
range = range.withPrefix(tenantEntry.prefix);
}
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(tr, range, 1));
Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));
if (active) {
// Idempotent request.
@ -10788,23 +10864,26 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer
KeyRange range,
int rangeLimit,
Optional<TenantName> tenantName) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
state Transaction tr(db);
state TenantMapEntry tme;
state Standalone<VectorRef<KeyRangeRef>> blobRanges;
if (tenantName.present()) {
wait(store(tme, blobLoadTenantMapEntry(&db, range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
loop {
try {
if (tenantName.present()) {
wait(store(tme, blobGranuleGetTenantEntry(&tr->getTransaction(), range.begin, tenantName)));
range = range.withPrefix(tme.prefix);
}
wait(store(blobRanges, getBlobRanges(&tr, range, rangeLimit)));
break;
} catch (Error& e) {
wait(tr->onError(e));
wait(tr.onError(e));
}
}
state Standalone<VectorRef<KeyRangeRef>> blobRanges = wait(getBlobRanges(tr, range, rangeLimit));
if (!tenantName.present()) {
return blobRanges;
}
@ -10826,9 +10905,9 @@ ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Refer
}
Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
int rowLimit,
int rangeLimit,
Optional<TenantName> tenantName) {
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rowLimit, tenantName);
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
}
int64_t getMaxKeySize(KeyRef const& key) {

View File

@ -42,7 +42,7 @@ ACTOR static Future<Void> produce(ParallelStream<ParallelStreamTest::TestValue>:
}
ACTOR static Future<Void> consume(FutureStream<ParallelStreamTest::TestValue> stream, int expected) {
state int next;
state int next = 0;
try {
loop {
ParallelStreamTest::TestValue value = waitNext(stream);

View File

@ -564,6 +564,10 @@ Version PaxosConfigTransaction::getCommittedVersion() const {
return impl->getCommittedVersion();
}
int64_t PaxosConfigTransaction::getTotalCost() const {
return 0;
}
int64_t PaxosConfigTransaction::getApproximateSize() const {
return impl->getApproximateSize();
}

View File

@ -41,7 +41,7 @@
#include "flow/Hostname.h"
#include "flow/UnitTest.h"
#include "rapidxml/rapidxml.hpp"
#ifdef BUILD_AWS_BACKUP
#ifdef WITH_AWS_BACKUP
#include "fdbclient/FDBAWSCredentialsProvider.h"
#endif
@ -88,6 +88,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
concurrent_lists = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_LISTS;
concurrent_reads_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_READS_PER_FILE;
concurrent_writes_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_WRITES_PER_FILE;
enable_read_cache = CLIENT_KNOBS->BLOBSTORE_ENABLE_READ_CACHE;
read_block_size = CLIENT_KNOBS->BLOBSTORE_READ_BLOCK_SIZE;
read_ahead_blocks = CLIENT_KNOBS->BLOBSTORE_READ_AHEAD_BLOCKS;
read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
@ -125,6 +126,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
TRY_PARAM(concurrent_lists, cl);
TRY_PARAM(concurrent_reads_per_file, crpf);
TRY_PARAM(concurrent_writes_per_file, cwpf);
TRY_PARAM(enable_read_cache, erc);
TRY_PARAM(read_block_size, rbs);
TRY_PARAM(read_ahead_blocks, rab);
TRY_PARAM(read_cache_blocks_per_file, rcb);
@ -162,6 +164,7 @@ std::string S3BlobStoreEndpoint::BlobKnobs::getURLParameters() const {
_CHECK_PARAM(concurrent_lists, cl);
_CHECK_PARAM(concurrent_reads_per_file, crpf);
_CHECK_PARAM(concurrent_writes_per_file, cwpf);
_CHECK_PARAM(enable_read_cache, erc);
_CHECK_PARAM(read_block_size, rbs);
_CHECK_PARAM(read_ahead_blocks, rab);
_CHECK_PARAM(read_cache_blocks_per_file, rcb);
@ -615,7 +618,7 @@ ACTOR Future<Optional<json_spirit::mObject>> tryReadJSONFile(std::string path) {
// If the credentials expire, the connection will eventually fail and be discarded from the pool, and then a new
// connection will be constructed, which will call this again to get updated credentials
static S3BlobStoreEndpoint::Credentials getSecretSdk() {
#ifdef BUILD_AWS_BACKUP
#ifdef WITH_AWS_BACKUP
double elapsed = -timer_monotonic();
Aws::Auth::AWSCredentials awsCreds = FDBAWSCredentialsProvider::getAwsCredentials();
elapsed += timer_monotonic();

View File

@ -115,6 +115,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENABLE_DETAILED_TLOG_POP_TRACE, false ); if ( randomize && BUGGIFY ) ENABLE_DETAILED_TLOG_POP_TRACE = true;
init( PEEK_BATCHING_EMPTY_MSG, false ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG = true;
init( PEEK_BATCHING_EMPTY_MSG_INTERVAL, 0.001 ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.01;
init( POP_FROM_LOG_DELAY, 1 ); if ( randomize && BUGGIFY ) POP_FROM_LOG_DELAY = 0;
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
@ -295,7 +296,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000;
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 );
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false );
init( DD_TENANT_AWARENESS_ENABLED, false ); if(isSimulated) DD_TENANT_AWARENESS_ENABLED = deterministicRandom()->coinflip();
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
@ -376,6 +377,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REPLACE_CONTENTS_BYTES, 1e5 );
// KeyValueStoreRocksDB
init( ROCKSDB_SET_READ_TIMEOUT, !isSimulated );
init( ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES, true ); if( randomize && BUGGIFY ) ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES = false;
init( ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE, true ); if( randomize && BUGGIFY ) ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE = false;
init( ROCKSDB_READ_RANGE_ROW_LIMIT, 65535 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ROW_LIMIT = deterministicRandom()->randomInt(2, 10);
@ -383,6 +385,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_WRITER_THREAD_PRIORITY, 0 );
init( ROCKSDB_BACKGROUND_PARALLELISM, 4 );
init( ROCKSDB_READ_PARALLELISM, 4 );
// If true, do not process and store RocksDB logs
init( ROCKSDB_MUTE_LOGS, false );
// Use a smaller memtable in simulation to avoid OOMs.
int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024;
init( ROCKSDB_MEMTABLE_BYTES, memtableBytes );
@ -404,7 +408,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_FETCH_QUEUE_SOFT_MAX, 50 );
init( ROCKSDB_HISTOGRAMS_SAMPLE_RATE, 0.001 ); if( randomize && BUGGIFY ) ROCKSDB_HISTOGRAMS_SAMPLE_RATE = 0;
init( ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME, 30.0 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME = 0.1;
init( ROCKSDB_READ_RANGE_REUSE_ITERATORS, true ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_ITERATORS = deterministicRandom()->coinflip() ? true : false;
init( ROCKSDB_READ_RANGE_REUSE_ITERATORS, true ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_ITERATORS = deterministicRandom()->coinflip();
init( ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS, false ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS = deterministicRandom()->coinflip();
init( ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT, 200 );
// Set to 0 to disable rocksdb write rate limiting. Rate limiter unit: bytes per second.
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO.
@ -412,7 +418,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY, "fdb");
init( ROCKSDB_DISABLE_AUTO_COMPACTIONS, false ); // RocksDB default
init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_PERFCONTEXT_ENABLE, false ); if( randomize && BUGGIFY ) ROCKSDB_PERFCONTEXT_ENABLE = deterministicRandom()->coinflip();
init( ROCKSDB_PERFCONTEXT_SAMPLE_RATE, 0.0001 );
init( ROCKSDB_METRICS_SAMPLE_INTERVAL, 0.0);
init( ROCKSDB_MAX_SUBCOMPACTIONS, 2 );
@ -422,10 +428,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Enable this knob only for experimental purposes, never enable this in production.
// If enabled, all the committed in-memory memtable writes are lost on a crash.
init( ROCKSDB_DISABLE_WAL_EXPERIMENTAL, false );
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ENABLE_CLEAR_RANGE_EAGER_READS knob.
// If ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE is enabled, disable ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS knob.
// These knobs have contrary functionality.
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip() ? false : true;
init( ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE, false ); if( randomize && BUGGIFY ) ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE = deterministicRandom()->coinflip();
init( ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT, 200000 ); // 200KB
init( ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip();
// ROCKSDB_STATS_LEVEL=1 indicates rocksdb::StatsLevel::kExceptHistogramOrTimers
init( ROCKSDB_STATS_LEVEL, 1 ); if( randomize && BUGGIFY ) ROCKSDB_STATS_LEVEL = deterministicRandom()->randomInt(0, 6);
// canCommit will delay for ROCKSDB_CAN_COMMIT_DELAY_ON_OVERLOAD seconds, up to
// ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD times, if RocksDB is overloaded.
// Set ROCKSDB_CAN_COMMIT_DELAY_TIMES_ON_OVERLOAD to 0 to disable
@ -731,9 +740,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENFORCE_TAG_THROTTLING_ON_PROXIES, GLOBAL_TAG_THROTTLING );
init( GLOBAL_TAG_THROTTLING_MIN_RATE, 1.0 );
init( GLOBAL_TAG_THROTTLING_FOLDING_TIME, 10.0 );
init( GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, 5.0 );
init( GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED, 10 );
init( GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER, 240.0 );
init( PROXY_MAX_TAG_THROTTLE_DURATION, 5.0 ); if( randomize && BUGGIFY ) PROXY_MAX_TAG_THROTTLE_DURATION = 0.5;
init( GLOBAL_TAG_THROTTLING_PROXY_LOGGING_INTERVAL, 60.0 );
//Storage Metrics
init( STORAGE_METRICS_AVERAGE_INTERVAL, 120.0 );
@ -761,8 +771,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_KEYS_PARALLELISM_FULL, 6 );
init( FETCH_KEYS_LOWER_PRIORITY, 0 );
init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
init( SERVE_AUDIT_STORAGE_PARALLELISM, 2 );
init( CHANGE_FEED_DISK_READS_PARALLELISM, 1000 ); if( randomize && BUGGIFY ) CHANGE_FEED_DISK_READS_PARALLELISM = 20;
init( SERVE_AUDIT_STORAGE_PARALLELISM, 1 );
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
init( STORAGE_COMMIT_BYTES, 10000000 ); if( randomize && BUGGIFY ) STORAGE_COMMIT_BYTES = 2000000;
@ -801,6 +810,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" );
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -912,7 +925,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_DEFAULT_EXTENT_SIZE, 32 * 1024 * 1024 );
init( REDWOOD_DEFAULT_EXTENT_READ_SIZE, 1024 * 1024 );
init( REDWOOD_EXTENT_CONCURRENT_READS, 4 );
init( REDWOOD_KVSTORE_CONCURRENT_READS, 64 );
init( REDWOOD_KVSTORE_RANGE_PREFETCH, true );
init( REDWOOD_PAGE_REBUILD_MAX_SLACK, 0.33 );
init( REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES, 10 );
@ -925,6 +937,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
@ -939,9 +952,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENCRYPTION_MODE, "AES-256-CTR" );
init( SIM_KMS_MAX_KEYS, 4096 );
init( ENCRYPT_PROXY_MAX_DBG_TRACE_LENGTH, 100000 );
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION && !PROXY_USE_RESOLVER_PRIVATE_MUTATIONS ) ENABLE_TLOG_ENCRYPTION = true;
init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_STORAGE_SERVER_ENCRYPTION = !ENABLE_STORAGE_SERVER_ENCRYPTION;
init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY) ENABLE_BLOB_GRANULE_ENCRYPTION = !ENABLE_BLOB_GRANULE_ENCRYPTION;
init( ENABLE_TLOG_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION ) ENABLE_TLOG_ENCRYPTION = false;
init( ENABLE_STORAGE_SERVER_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION) ENABLE_STORAGE_SERVER_ENCRYPTION = false;
init( ENABLE_BLOB_GRANULE_ENCRYPTION, ENABLE_ENCRYPTION ); if ( randomize && BUGGIFY && ENABLE_ENCRYPTION) ENABLE_BLOB_GRANULE_ENCRYPTION = false;
// encrypt key proxy
init( ENABLE_BLOB_GRANULE_COMPRESSION, false ); if ( randomize && BUGGIFY ) { ENABLE_BLOB_GRANULE_COMPRESSION = deterministicRandom()->coinflip(); }
@ -951,7 +964,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" );
// Blob granules
init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually
init( BG_URL, isSimulated ? "file://simfdb/fdbblob/" : "" ); // TODO: store in system key space or something, eventually
bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY);
// BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs"
init( BG_METADATA_SOURCE, "knobs" );
@ -967,6 +980,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false;
init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10);
init( BG_KEY_TUPLE_TRUNCATE_OFFSET, 0 );
init( BG_ENABLE_READ_DRIVEN_COMPACTION, true ); if (randomize && BUGGIFY) BG_ENABLE_READ_DRIVEN_COMPACTION = false;
init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
@ -975,6 +991,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10);
init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100);
init( BLOB_WORKER_RDC_PARALLELISM, 2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6);
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 );
@ -990,6 +1008,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANIFEST_BACKUP, false );
init( BLOB_MANIFEST_BACKUP_INTERVAL, isSimulated ? 5.0 : 30.0 );
init( BLOB_FULL_RESTORE_MODE, false );
init( BLOB_MIGRATOR_CHECK_INTERVAL, isSimulated ? 1.0 : 5.0);
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );
@ -997,8 +1016,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// Blob Metadata
init( BLOB_METADATA_CACHE_TTL, isSimulated ? 120 : 24 * 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_CACHE_TTL = deterministicRandom()->randomInt(50, 100); }
init( BLOB_METADATA_REFRESH_INTERVAL, isSimulated ? 60 : 60 * 60 );
if ( randomize && BUGGIFY) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
// HTTP KMS Connector
init( REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE, "file");
@ -1019,6 +1036,10 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// NOTE: 'token-name' can NOT contain the '#' character
init( REST_KMS_CONNECTOR_VALIDATION_TOKEN_DETAILS, "");
// Drop in-memory state associated with an idempotency id after this many seconds. Once dropped, this id cannot be
// expired proactively, but will eventually get cleaned up by the idempotency id cleaner.
init( IDEMPOTENCY_ID_IN_MEMORY_LIFETIME, 10);
// clang-format on
if (clientKnobs) {

View File

@ -296,6 +296,10 @@ Version SimpleConfigTransaction::getCommittedVersion() const {
return impl->getCommittedVersion();
}
int64_t SimpleConfigTransaction::getTotalCost() const {
return 0;
}
int64_t SimpleConfigTransaction::getApproximateSize() const {
return impl->getApproximateSize();
}

View File

@ -284,8 +284,6 @@ const KeyRangeRef readConflictRangeKeysRange =
const KeyRangeRef writeConflictRangeKeysRange = KeyRangeRef("\xff\xff/transaction/write_conflict_range/"_sr,
"\xff\xff/transaction/write_conflict_range/\xff\xff"_sr);
const KeyRef clusterIdKey = "\xff/clusterId"_sr;
const KeyRangeRef auditRange = KeyRangeRef("\xff/audit/"_sr, "\xff/audit0"_sr);
const KeyRef auditPrefix = auditRange.begin;
@ -1074,6 +1072,11 @@ const KeyRangeRef timeKeeperPrefixRange("\xff\x02/timeKeeper/map/"_sr, "\xff\x02
const KeyRef timeKeeperVersionKey = "\xff\x02/timeKeeper/version"_sr;
const KeyRef timeKeeperDisableKey = "\xff\x02/timeKeeper/disable"_sr;
// Durable cluster ID key. Added "Key" to the end to differentiate from the key
// "\xff/clusterId" which was stored in the txnStateStore in FDB 7.1, whereas
// this key is stored in the database in 7.2+.
const KeyRef clusterIdKey = "\xff/clusterIdKey"_sr;
// Backup Log Mutation constant variables
const KeyRef backupEnabledKey = "\xff/backupEnabled"_sr;
const KeyRangeRef backupLogKeys("\xff\x02/blog/"_sr, "\xff\x02/blog0"_sr);
@ -1810,4 +1813,4 @@ TEST_CASE("noSim/SystemData/compat/KeyServers") {
printf("ssi serdes test complete\n");
return Void();
}
}

View File

@ -626,6 +626,14 @@ ThreadFuture<SpanContext> ThreadSafeTransaction::getSpanContext() {
});
}
ThreadFuture<int64_t> ThreadSafeTransaction::getTotalCost() {
ISingleThreadTransaction* tr = this->tr;
return onMainThread([tr]() -> Future<int64_t> {
tr->checkDeferredError();
return tr->getTotalCost();
});
}
ThreadFuture<int64_t> ThreadSafeTransaction::getApproximateSize() {
ISingleThreadTransaction* tr = this->tr;
return onMainThread([tr]() -> Future<int64_t> {
@ -735,10 +743,10 @@ void ThreadSafeApi::runNetwork() {
Optional<Error> runErr;
try {
::runNetwork();
} catch (Error& e) {
} catch (const Error& e) {
TraceEvent(SevError, "RunNetworkError").error(e);
runErr = e;
} catch (std::exception& e) {
} catch (const std::exception& e) {
runErr = unknown_error();
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
@ -749,9 +757,9 @@ void ThreadSafeApi::runNetwork() {
for (auto& hook : threadCompletionHooks) {
try {
hook.first(hook.second);
} catch (Error& e) {
} catch (const Error& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(e);
} catch (std::exception& e) {
} catch (const std::exception& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
} catch (...) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
@ -759,12 +767,10 @@ void ThreadSafeApi::runNetwork() {
}
if (runErr.present()) {
closeTraceFile();
throw runErr.get();
}
TraceEvent("RunNetworkTerminating");
closeTraceFile();
}
void ThreadSafeApi::stopNetwork() {

View File

@ -196,6 +196,7 @@ public:
Key addPrefix = Key(),
Key removePrefix = Key(),
LockDB = LockDB::True,
UnlockDB = UnlockDB::True,
OnlyApplyMutationLogs = OnlyApplyMutationLogs::False,
InconsistentSnapshotOnly = InconsistentSnapshotOnly::False,
Version beginVersion = ::invalidVersion,

View File

@ -103,6 +103,7 @@ public:
Counter latestCipherKeyCacheNeedsRefresh;
LatencySample getCipherKeysLatency;
LatencySample getLatestCipherKeysLatency;
LatencySample getBlobMetadataLatency;
std::array<CounterSet, int(UsageType::MAX)> counterSets;
};

View File

@ -91,4 +91,8 @@ struct BlobMetadataDetailsRef {
}
};
Standalone<BlobMetadataDetailsRef> createRandomTestBlobMetadata(const std::string& baseUrl,
BlobMetadataDomainId domainId,
BlobMetadataDomainName domainName);
#endif

View File

@ -45,6 +45,7 @@ struct BlobWorkerStats {
Counter compressionBytesFinal;
Counter fullRejections;
Counter forceFlushCleanups;
Counter readDrivenCompactions;
int numRangesAssigned;
int mutationBytesBuffered;
@ -83,10 +84,11 @@ struct BlobWorkerStats {
readRequestsWithBegin("ReadRequestsWithBegin", cc), readRequestsCollapsed("ReadRequestsCollapsed", cc),
flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
forceFlushCleanups("ForceFlushCleanups", cc), numRangesAssigned(0), mutationBytesBuffered(0),
activeReadRequests(0), granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0),
notAtLatestChangeFeeds(0), lastResidentMemory(0), estimatedMaxResidentMemory(0),
initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock), deltaWritesLock(deltaWritesLock) {
forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0),
minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0),
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock),
deltaWritesLock(deltaWritesLock) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });

View File

@ -22,7 +22,7 @@
#define FDBCLIENT_BUILD_IDEMPOTENCY_ID_MUTATIONS_H
#include "fdbclient/CommitProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#pragma once

View File

@ -199,7 +199,6 @@ public:
int32_t DEFAULT_MAX_GRV_PROXIES;
int32_t DEFAULT_AUTO_RESOLVERS;
int32_t DEFAULT_AUTO_LOGS;
bool DELETE_NATIVE_LIB_AFTER_LOADING;
double GLOBAL_CONFIG_REFRESH_BACKOFF;
double GLOBAL_CONFIG_REFRESH_MAX_BACKOFF;
@ -235,6 +234,7 @@ public:
int BLOBSTORE_CONCURRENT_LISTS;
int BLOBSTORE_CONCURRENT_WRITES_PER_FILE;
int BLOBSTORE_CONCURRENT_READS_PER_FILE;
int BLOBSTORE_ENABLE_READ_CACHE;
int BLOBSTORE_READ_BLOCK_SIZE;
int BLOBSTORE_READ_AHEAD_BLOCKS;
int BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
@ -262,8 +262,8 @@ public:
double TAG_THROTTLE_EXPIRATION_INTERVAL;
int64_t WRITE_COST_BYTE_FACTOR; // Used to round up the cost of write operations
int64_t READ_COST_BYTE_FACTOR; // Used to round up the cost of read operations
double PROXY_MAX_TAG_THROTTLE_DURATION; // Maximum duration that a transaction can be tag throttled by proxy before
// being rejected
// Cost multiplier for writes (because write operations are more expensive than reads):
double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
// busyness reporting
double BUSYNESS_SPIKE_START_THRESHOLD;
@ -272,6 +272,7 @@ public:
// Blob Granules
int BG_MAX_GRANULE_PARALLELISM;
int BG_TOO_MANY_GRANULES;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// The coordinator key/value in the storage server might be inconsistent with the value stored in the cluster file.
// The coordinator key/value in the storage server might be inconsistent with the value stored in the cluster file.
// This might happen when a recovery is happening together with a cluster controller coordinator key change.

View File

@ -30,7 +30,7 @@
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GlobalConfig.h"
#include "fdbclient/GrvProxyInterface.h"
#include "fdbclient/IdempotencyId.h"
#include "fdbclient/IdempotencyId.actor.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/TagThrottle.actor.h"
#include "fdbclient/VersionVector.h"
@ -61,6 +61,7 @@ struct CommitProxyInterface {
RequestStream<struct ProxySnapRequest> proxySnapReq;
RequestStream<struct ExclusionSafetyCheckRequest> exclusionSafetyCheckReq;
RequestStream<struct GetDDMetricsRequest> getDDMetrics;
PublicRequestStream<struct ExpireIdempotencyIdRequest> expireIdempotencyId;
UID id() const { return commit.getEndpoint().token; }
std::string toString() const { return id().shortString(); }
@ -87,6 +88,8 @@ struct CommitProxyInterface {
exclusionSafetyCheckReq =
RequestStream<struct ExclusionSafetyCheckRequest>(commit.getEndpoint().getAdjustedEndpoint(8));
getDDMetrics = RequestStream<struct GetDDMetricsRequest>(commit.getEndpoint().getAdjustedEndpoint(9));
expireIdempotencyId =
PublicRequestStream<struct ExpireIdempotencyIdRequest>(commit.getEndpoint().getAdjustedEndpoint(10));
}
}
@ -103,6 +106,7 @@ struct CommitProxyInterface {
streams.push_back(proxySnapReq.getReceiver());
streams.push_back(exclusionSafetyCheckReq.getReceiver());
streams.push_back(getDDMetrics.getReceiver());
streams.push_back(expireIdempotencyId.getReceiver());
FlowTransport::transport().addEndpoints(streams);
}
};
@ -151,6 +155,24 @@ struct ClientDBInfo {
}
};
struct ExpireIdempotencyIdRequest {
constexpr static FileIdentifier file_identifier = 1900933;
Version commitVersion = invalidVersion;
uint8_t batchIndexHighByte = 0;
TenantInfo tenant;
ExpireIdempotencyIdRequest() {}
ExpireIdempotencyIdRequest(Version commitVersion, uint8_t batchIndexHighByte, TenantInfo tenant)
: commitVersion(commitVersion), batchIndexHighByte(batchIndexHighByte), tenant(tenant) {}
bool verify() const { return tenant.isAuthorized(); }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, commitVersion, batchIndexHighByte, tenant);
}
};
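A hedged sketch of how a caller might build this request before handing it to the new expireIdempotencyId stream; commitVersion, batchIndex, and tenantInfo are placeholders, not taken from this diff:
// Hedged sketch: construct the request and check tenant authorization before dispatch.
ExpireIdempotencyIdRequest req(commitVersion, uint8_t(batchIndex >> 8), tenantInfo);
ASSERT(req.verify()); // verify() only passes when the tenant token is authorized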
struct CommitID {
constexpr static FileIdentifier file_identifier = 14254927;
Version version; // returns invalidVersion if transaction conflicts

View File

@ -382,7 +382,8 @@ public:
Version end = std::numeric_limits<Version>::max(),
KeyRange range = allKeys,
int replyBufferSize = -1,
bool canReadPopped = true);
bool canReadPopped = true,
ReadOptions readOptions = { ReadType::NORMAL, CacheResult::False });
Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);

View File

@ -546,36 +546,37 @@ struct hash<KeyRange> {
enum { invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits<int64_t>::max() };
inline Key keyAfter(const KeyRef& key) {
if (key == "\xff\xff"_sr)
return key;
Standalone<StringRef> r;
uint8_t* s = new (r.arena()) uint8_t[key.size() + 1];
if (key.size() > 0) {
memcpy(s, key.begin(), key.size());
}
s[key.size()] = 0;
((StringRef&)r) = StringRef(s, key.size() + 1);
return r;
}
inline KeyRef keyAfter(const KeyRef& key, Arena& arena) {
if (key == "\xff\xff"_sr)
return key;
// Don't include fdbclient/SystemData.h for the allKeys symbol to avoid a cyclic include
static const auto allKeysEnd = "\xff\xff"_sr;
if (key == allKeysEnd) {
return allKeysEnd;
}
uint8_t* t = new (arena) uint8_t[key.size() + 1];
memcpy(t, key.begin(), key.size());
if (!key.empty()) {
memcpy(t, key.begin(), key.size());
}
t[key.size()] = 0;
return KeyRef(t, key.size() + 1);
}
inline KeyRange singleKeyRange(const KeyRef& a) {
return KeyRangeRef(a, keyAfter(a));
inline Key keyAfter(const KeyRef& key) {
Key result;
result.contents() = keyAfter(key, result.arena());
return result;
}
inline KeyRangeRef singleKeyRange(KeyRef const& key, Arena& arena) {
uint8_t* t = new (arena) uint8_t[key.size() + 1];
memcpy(t, key.begin(), key.size());
if (!key.empty()) {
memcpy(t, key.begin(), key.size());
}
t[key.size()] = 0;
return KeyRangeRef(KeyRef(t, key.size()), KeyRef(t, key.size() + 1));
}
inline KeyRange singleKeyRange(const KeyRef& a) {
KeyRange result;
result.contents() = singleKeyRange(a, result.arena());
return result;
}
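The refactor preserves the observable semantics: keyAfter appends a single 0x00 byte (saturating at the \xff\xff sentinel), and singleKeyRange spans exactly that one key. A hedged illustrative check, not part of the diff:
// Hedged sketch of the helper semantics above.
Key k = "abc"_sr;
Key next = keyAfter(k); // "abc\x00"
ASSERT(next.size() == k.size() + 1);
KeyRange r = singleKeyRange(k); // ["abc", "abc\x00")
ASSERT(r.contains(k) && !r.contains(next));
ASSERT(keyAfter("\xff\xff"_sr) == "\xff\xff"_sr); // the end-of-keyspace sentinel is its own successor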
inline KeyRange prefixRange(KeyRef prefix) {
Standalone<KeyRangeRef> range;
KeyRef start = KeyRef(range.arena(), prefix);
@ -1494,7 +1495,7 @@ struct EncryptionAtRestMode {
bool operator==(const EncryptionAtRestMode& e) const { return isEquals(e); }
bool operator!=(const EncryptionAtRestMode& e) const { return !isEquals(e); }
static EncryptionAtRestMode fromValue(Optional<ValueRef> val) {
static EncryptionAtRestMode fromValueRef(Optional<ValueRef> val) {
if (!val.present()) {
return DISABLED;
}
@ -1508,6 +1509,14 @@ struct EncryptionAtRestMode {
return static_cast<Mode>(num);
}
static EncryptionAtRestMode fromValue(Optional<Value> val) {
if (!val.present()) {
return EncryptionAtRestMode();
}
return EncryptionAtRestMode::fromValueRef(Optional<ValueRef>(val.get().contents()));
}
uint32_t mode;
};
@ -1635,13 +1644,7 @@ struct StorageWiggleValue {
}
};
enum class ReadType {
EAGER,
FETCH,
LOW,
NORMAL,
HIGH,
};
enum class ReadType { EAGER = 0, FETCH = 1, LOW = 2, NORMAL = 3, HIGH = 4, MIN = EAGER, MAX = HIGH };
FDB_DECLARE_BOOLEAN_PARAM(CacheResult);
@ -1657,14 +1660,14 @@ struct ReadOptions {
Optional<UID> debugID;
Optional<Version> consistencyCheckStartVersion;
ReadOptions() : type(ReadType::NORMAL), cacheResult(CacheResult::True){};
ReadOptions(Optional<UID> debugID,
ReadOptions(Optional<UID> debugID = Optional<UID>(),
ReadType type = ReadType::NORMAL,
CacheResult cache = CacheResult::False,
CacheResult cache = CacheResult::True,
Optional<Version> version = Optional<Version>())
: type(type), cacheResult(cache), debugID(debugID), consistencyCheckStartVersion(version){};
ReadOptions(ReadType type, CacheResult cache = CacheResult::True) : ReadOptions({}, type, cache) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, type, cacheResult, debugID, consistencyCheckStartVersion);
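With the constructor changes above, every form now defaults cacheResult to CacheResult::True; a hedged illustration of the three construction paths:
// Hedged sketch of the new ReadOptions defaults (values are illustrative).
ReadOptions defaults; // type NORMAL, cacheResult True
ReadOptions fetch(ReadType::FETCH); // delegating constructor, cacheResult still True
ReadOptions traced(deterministicRandom()->randomUniqueID(), ReadType::LOW, CacheResult::False); // opt out explicitly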

View File

@ -284,7 +284,6 @@ Future<ConfigurationResult> changeConfig(Reference<DB> db, std::map<std::string,
state Key versionKey = BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned());
state bool oldReplicationUsesDcId = false;
state bool warnPPWGradual = false;
state bool warnChangeStorageNoMigrate = false;
state bool warnRocksDBIsExperimental = false;
state bool warnShardedRocksDBIsExperimental = false;
loop {

View File

@ -120,6 +120,7 @@ public:
// later if they are not really needed.
virtual ThreadFuture<VersionVector> getVersionVector() = 0;
virtual ThreadFuture<SpanContext> getSpanContext() = 0;
virtual ThreadFuture<int64_t> getTotalCost() = 0;
virtual ThreadFuture<int64_t> getApproximateSize() = 0;
virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;

View File

@ -101,6 +101,7 @@ public:
virtual Version getCommittedVersion() const = 0;
virtual VersionVector getVersionVector() const = 0;
virtual SpanContext getSpanContext() const = 0;
virtual int64_t getTotalCost() const = 0;
virtual int64_t getApproximateSize() const = 0;
virtual Future<Standalone<StringRef>> getVersionstamp() = 0;
virtual void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) = 0;

View File

@ -1,5 +1,5 @@
/*
* IdempotencyId.h
* IdempotencyId.actor.h
*
* This source file is part of the FoundationDB open source project
*
@ -18,8 +18,13 @@
* limitations under the License.
*/
#ifndef FDBCLIENT_IDEMPOTENCYID_H
#define FDBCLIENT_IDEMPOTENCYID_H
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_G_H
#include "fdbclient/IdempotencyId.actor.g.h"
#elif !defined(FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H)
#define FDBCLIENT_IDEMPOTENCY_ID_ACTOR_H
#pragma once
@ -28,12 +33,24 @@
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/serialize.h"
#include "flow/actorcompiler.h" // this has to be the last include
struct CommitResult {
Version commitVersion;
uint16_t batchIndex;
};
// The type of the value stored at the key |idempotencyIdsExpiredVersion|
struct IdempotencyIdsExpiredVersion {
static constexpr auto file_identifier = 3746945;
Version expired = 0;
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, expired);
}
};
// See design/idempotency_ids.md for more information. Designed so that the common case of a random 16 byte id does not
// usually require indirection. Either invalid or an id with length >= 16 and < 256.
struct IdempotencyIdRef {
@ -163,4 +180,10 @@ private:
// Check if id is present in kv, and if so return the commit version and batchIndex
Optional<CommitResult> kvContainsIdempotencyId(const KeyValueRef& kv, const IdempotencyIdRef& id);
#endif
// Make a range containing only the idempotency key associated with version and highOrderBatchIndex
KeyRangeRef makeIdempotencySingleKeyRange(Arena& arena, Version version, uint8_t highOrderBatchIndex);
void decodeIdempotencyKey(KeyRef key, Version& commitVersion, uint8_t& highOrderBatchIndex);
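A hedged round-trip sketch for the two helpers declared above; the version and batch index are arbitrary example values:
// Hedged sketch: build the idempotency key range for a commit and decode it back.
Arena arena;
Version commitVersion = 100000000;
uint8_t highOrderBatchIndex = 3;
KeyRangeRef idRange = makeIdempotencySingleKeyRange(arena, commitVersion, highOrderBatchIndex);
Version decodedVersion;
uint8_t decodedIndex;
decodeIdempotencyKey(idRange.begin, decodedVersion, decodedIndex);
ASSERT(decodedVersion == commitVersion && decodedIndex == highOrderBatchIndex);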
#include "flow/unactorcompiler.h"
#endif

View File

@ -377,6 +377,7 @@ struct FdbCApi : public ThreadSafeReferenceCounted<FdbCApi> {
FDBFuture* (*transactionCommit)(FDBTransaction* tr);
fdb_error_t (*transactionGetCommittedVersion)(FDBTransaction* tr, int64_t* outVersion);
FDBFuture* (*transactionGetTotalCost)(FDBTransaction* tr);
FDBFuture* (*transactionGetApproximateSize)(FDBTransaction* tr);
FDBFuture* (*transactionWatch)(FDBTransaction* tr, uint8_t const* keyName, int keyNameLength);
FDBFuture* (*transactionOnError)(FDBTransaction* tr, fdb_error_t error);
@ -505,6 +506,7 @@ public:
Version getCommittedVersion() override;
ThreadFuture<VersionVector> getVersionVector() override;
ThreadFuture<SpanContext> getSpanContext() override { return SpanContext(); };
ThreadFuture<int64_t> getTotalCost() override;
ThreadFuture<int64_t> getApproximateSize() override;
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
@ -732,6 +734,7 @@ public:
Version getCommittedVersion() override;
ThreadFuture<VersionVector> getVersionVector() override;
ThreadFuture<SpanContext> getSpanContext() override;
ThreadFuture<int64_t> getTotalCost() override;
ThreadFuture<int64_t> getApproximateSize() override;
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
@ -1024,6 +1027,7 @@ public:
ThreadFuture<Void> protocolVersionMonitor;
Future<Void> sharedStateUpdater;
bool isConfigDB;
// Versions older than 6.1 do not benefit from having their database connections closed. Additionally,
// there are various issues that result in negative behavior in some cases if the connections are closed.
@ -1147,6 +1151,7 @@ private:
bool disableBypass;
volatile bool bypassMultiClientApi;
volatile bool externalClient;
bool retainClientLibCopies;
ApiVersion apiVersion;
int nextThread = 0;

View File

@ -249,6 +249,9 @@ struct TransactionState : ReferenceCounted<TransactionState> {
SpanContext spanContext;
UseProvisionalProxies useProvisionalProxies = UseProvisionalProxies::False;
bool readVersionObtainedFromGrvProxy;
// Measured by summing the bytes accessed by each read and write operation
// after rounding up to the nearest page size and applying a write penalty
int64_t totalCost = 0;
// Special flag to skip prepending tenant prefix to mutations and conflict ranges
// when a dummy, internal transaction gets commited. The sole purpose of commitDummyTransaction() is to
@ -268,6 +271,8 @@ struct TransactionState : ReferenceCounted<TransactionState> {
// prefix/<key2> : '0' - any keys equal or larger than this key are (definitely) not conflicting keys
std::shared_ptr<CoalescedKeyRangeMap<Value>> conflictingKeys;
bool automaticIdempotency = false;
// Only available so that Transaction can have a default constructor, for use in state variables
TransactionState(TaskPriority taskID, SpanContext spanContext)
: taskID(taskID), spanContext(spanContext), tenantSet(false) {}
@ -445,6 +450,8 @@ public:
// May be called only after commit() returns success
Version getCommittedVersion() const { return trState->committedVersion; }
int64_t getTotalCost() const { return trState->totalCost; }
// Will be fulfilled only after commit() returns success
[[nodiscard]] Future<Standalone<StringRef>> getVersionstamp();
@ -482,6 +489,7 @@ public:
Database getDatabase() const { return trState->cx; }
static Reference<TransactionLogInfo> createTrLogInfoProbabilistically(const Database& cx);
Transaction& getTransaction() { return *this; }
void setTransactionID(UID id);
void setToken(uint64_t token);
@ -563,9 +571,16 @@ ACTOR Future<std::vector<CheckpointMetaData>> getCheckpointMetaData(Database cx,
// Checks with Data Distributor that it is safe to mark all servers in exclusions as failed
ACTOR Future<bool> checkSafeExclusions(Database cx, std::vector<AddressExclusion> exclusions);
// Round up to the nearest page size
// Measured in bytes, rounded up to the nearest page size. Multiply by fungibility ratio
// because writes are more expensive than reads.
inline uint64_t getWriteOperationCost(uint64_t bytes) {
return (bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1;
return CLIENT_KNOBS->GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO * CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR *
((bytes - 1) / CLIENT_KNOBS->WRITE_COST_BYTE_FACTOR + 1);
}
// Measured in bytes, rounded up to the nearest page size.
inline uint64_t getReadOperationCost(uint64_t bytes) {
return ((bytes - 1) / CLIENT_KNOBS->READ_COST_BYTE_FACTOR + 1) * CLIENT_KNOBS->READ_COST_BYTE_FACTOR;
}
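A worked example of both cost formulas, using stand-in constants rather than the real knob defaults:
// Hedged, standalone arithmetic check of the formulas above; the factor values are
// illustrative stand-ins for WRITE_COST_BYTE_FACTOR, READ_COST_BYTE_FACTOR, and
// GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO, not their actual defaults.
constexpr uint64_t writeFactor = 16384, readFactor = 16384;
constexpr double rwFungibilityRatio = 4.0;
uint64_t writeCost = rwFungibilityRatio * writeFactor * ((100 - 1) / writeFactor + 1); // 4 * 16384 * 1 = 65536
uint64_t readCost = ((20000 - 1) / readFactor + 1) * readFactor; // 2 * 16384 = 32768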
// Create a transaction to set the value of system key \xff/conf/perpetual_storage_wiggle. If enable == true, the value

View File

@ -64,6 +64,7 @@ public:
void clear(KeyRef const&) override;
Future<Void> commit() override;
Version getCommittedVersion() const override;
int64_t getTotalCost() const override;
int64_t getApproximateSize() const override;
void setOption(FDBTransactionOptions::Option option, Optional<StringRef> value = Optional<StringRef>()) override;
Future<Void> onError(Error const& e) override;

View File

@ -149,6 +149,7 @@ public:
VersionVector getVersionVector() const override { return tr.getVersionVector(); }
SpanContext getSpanContext() const override { return tr.getSpanContext(); }
int64_t getTotalCost() const override { return tr.getTotalCost(); }
int64_t getApproximateSize() const override { return approximateSize; }
[[nodiscard]] Future<Standalone<StringRef>> getVersionstamp() override;

View File

@ -58,8 +58,8 @@ public:
requests_per_second, list_requests_per_second, write_requests_per_second, read_requests_per_second,
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
read_block_size, read_ahead_blocks, read_cache_blocks_per_file, max_send_bytes_per_second,
max_recv_bytes_per_second, sdk_auth;
enable_read_cache, read_block_size, read_ahead_blocks, read_cache_blocks_per_file,
max_send_bytes_per_second, max_recv_bytes_per_second, sdk_auth;
bool set(StringRef name, int value);
std::string getURLParameters() const;
static std::vector<std::string> getKnobDescriptions() {
@ -86,6 +86,7 @@ public:
"concurrent_lists (or cl) Max concurrent list operations that can be in progress at once.",
"concurrent_reads_per_file (or crps) Max concurrent reads in progress for any one file.",
"concurrent_writes_per_file (or cwps) Max concurrent uploads in progress for any one file.",
"enable_read_cache (or erc) Whether read block caching is enabled.",
"read_block_size (or rbs) Block size in bytes to be used for reads.",
"read_ahead_blocks (or rab) Number of blocks to read ahead of requested offset.",
"read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.",

View File

@ -110,6 +110,7 @@ public:
double BLOCKING_PEEK_TIMEOUT;
bool PEEK_BATCHING_EMPTY_MSG;
double PEEK_BATCHING_EMPTY_MSG_INTERVAL;
double POP_FROM_LOG_DELAY;
// Data distribution queue
double HEALTH_POLL_TIME;
@ -306,16 +307,18 @@ public:
int64_t REPLACE_CONTENTS_BYTES;
// KeyValueStoreRocksDB
int ROCKSDB_READER_THREAD_PRIORITY;
int ROCKSDB_WRITER_THREAD_PRIORITY;
bool ROCKSDB_SET_READ_TIMEOUT;
bool ROCKSDB_LEVEL_COMPACTION_DYNAMIC_LEVEL_BYTES;
int ROCKSDB_SUGGEST_COMPACT_CLEAR_RANGE;
int ROCKSDB_READ_RANGE_ROW_LIMIT;
int ROCKSDB_READER_THREAD_PRIORITY;
int ROCKSDB_WRITER_THREAD_PRIORITY;
int ROCKSDB_BACKGROUND_PARALLELISM;
int ROCKSDB_READ_PARALLELISM;
int64_t ROCKSDB_MEMTABLE_BYTES;
bool ROCKSDB_LEVEL_STYLE_COMPACTION;
bool ROCKSDB_UNSAFE_AUTO_FSYNC;
bool ROCKSDB_MUTE_LOGS;
int64_t ROCKSDB_PERIODIC_COMPACTION_SECONDS;
int ROCKSDB_PREFIX_LEN;
int64_t ROCKSDB_BLOCK_CACHE_SIZE;
@ -333,6 +336,8 @@ public:
double ROCKSDB_HISTOGRAMS_SAMPLE_RATE;
double ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME;
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
bool ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS;
int ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT;
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY;
@ -349,6 +354,8 @@ public:
bool ROCKSDB_DISABLE_WAL_EXPERIMENTAL;
bool ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE;
int64_t ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT;
bool ROCKSDB_ENABLE_CLEAR_RANGE_EAGER_READS;
int ROCKSDB_STATS_LEVEL;
int64_t ROCKSDB_COMPACTION_READAHEAD_SIZE;
int64_t ROCKSDB_BLOCK_SIZE;
bool ENABLE_SHARDED_ROCKSDB;
@ -628,14 +635,16 @@ public:
double GLOBAL_TAG_THROTTLING_MIN_RATE;
// Used by global tag throttling counters
double GLOBAL_TAG_THROTTLING_FOLDING_TIME;
// Cost multiplier for writes (because write operations are more expensive than reads)
double GLOBAL_TAG_THROTTLING_RW_FUNGIBILITY_RATIO;
// Maximum number of tags tracked by global tag throttler. Additional tags will be ignored
// until some existing tags expire
int64_t GLOBAL_TAG_THROTTLING_MAX_TAGS_TRACKED;
// Global tag throttler forgets about throughput from a tag once no new transactions from that
// tag have been received for this duration (in seconds):
int64_t GLOBAL_TAG_THROTTLING_TAG_EXPIRE_AFTER;
// Maximum duration that a transaction can be tag throttled by proxy before being rejected
double PROXY_MAX_TAG_THROTTLE_DURATION;
// Interval at which latency bands are logged for each tag on grv proxy
double GLOBAL_TAG_THROTTLING_PROXY_LOGGING_INTERVAL;
double MAX_TRANSACTIONS_PER_BYTE;
@ -717,7 +726,6 @@ public:
int FETCH_KEYS_LOWER_PRIORITY;
int SERVE_FETCH_CHECKPOINT_PARALLELISM;
int SERVE_AUDIT_STORAGE_PARALLELISM;
int CHANGE_FEED_DISK_READS_PARALLELISM;
int BUGGIFY_BLOCK_BYTES;
int64_t STORAGE_RECOVERY_VERSION_LAG_LIMIT;
double STORAGE_DURABILITY_LAG_REJECT_THRESHOLD;
@ -740,7 +748,6 @@ public:
int64_t MIN_TAG_READ_PAGES_RATE;
int64_t MIN_TAG_WRITE_PAGES_RATE;
double TAG_MEASUREMENT_INTERVAL;
int64_t READ_COST_BYTE_FACTOR;
bool PREFIX_COMPRESS_KVS_MEM_SNAPSHOTS;
bool REPORT_DD_METRICS;
double DD_METRICS_REPORT_INTERVAL;
@ -757,6 +764,9 @@ public:
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -886,7 +896,6 @@ public:
int REDWOOD_DEFAULT_EXTENT_SIZE; // Extent size for new Redwood files
int REDWOOD_DEFAULT_EXTENT_READ_SIZE; // Extent read size for Redwood files
int REDWOOD_EXTENT_CONCURRENT_READS; // Max number of simultaneous extent disk reads in progress.
int REDWOOD_KVSTORE_CONCURRENT_READS; // Max number of simultaneous point or range reads in progress.
bool REDWOOD_KVSTORE_RANGE_PREFETCH; // Whether to use range read prefetching
double REDWOOD_PAGE_REBUILD_MAX_SLACK; // When rebuilding pages, max slack to allow in page
int REDWOOD_LAZY_CLEAR_BATCH_SIZE_PAGES; // Number of pages to try to pop from the lazy delete queue and process at
@ -906,6 +915,8 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS;
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;
double LATENCY_METRICS_LOGGING_INTERVAL;
@ -950,10 +961,14 @@ public:
int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
int BG_MERGE_CANDIDATE_DELAY_SECONDS;
int BG_KEY_TUPLE_TRUNCATE_OFFSET;
bool BG_ENABLE_READ_DRIVEN_COMPACTION;
int BG_RDC_BYTES_FACTOR;
int BG_RDC_READ_FACTOR;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
int BLOB_WORKER_RESNAPSHOT_PARALLELISM;
int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM;
int BLOB_WORKER_RDC_PARALLELISM;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout
@ -972,10 +987,10 @@ public:
bool BLOB_MANIFEST_BACKUP;
double BLOB_MANIFEST_BACKUP_INTERVAL;
bool BLOB_FULL_RESTORE_MODE;
double BLOB_MIGRATOR_CHECK_INTERVAL;
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;
int64_t BLOB_METADATA_REFRESH_INTERVAL;
// HTTP KMS Connector
std::string REST_KMS_CONNECTOR_KMS_DISCOVERY_URL_MODE;
@ -989,6 +1004,9 @@ public:
std::string REST_KMS_CONNECTOR_GET_ENCRYPTION_KEYS_ENDPOINT;
std::string REST_KMS_CONNECTOR_GET_BLOB_METADATA_ENDPOINT;
// Idempotency ids
double IDEMPOTENCY_ID_IN_MEMORY_LIFETIME;
ServerKnobs(Randomize, ClientKnobs*, IsSimulated);
void initialize(Randomize, ClientKnobs*, IsSimulated);
};

View File

@ -76,6 +76,7 @@ public:
void reset() override;
void debugTransaction(UID dID) override;
void checkDeferredError() const override;
int64_t getTotalCost() const override;
int64_t getApproximateSize() const override;
void set(KeyRef const&, ValueRef const&) override;
void clear(KeyRangeRef const&) override { throw client_invalid_operation(); }

View File

@ -45,7 +45,7 @@ struct CheckpointMetaData {
constexpr static FileIdentifier file_identifier = 13804342;
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int16_t format; // CheckpointFormat.
UID ssID; // Storage server ID on which this checkpoint is created.
UID checkpointID; // A unique id for this checkpoint.
@ -58,11 +58,15 @@ struct CheckpointMetaData {
CheckpointMetaData() = default;
CheckpointMetaData(KeyRange const& range, CheckpointFormat format, UID const& ssID, UID const& checkpointID)
: version(invalidVersion), range(range), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(invalidVersion), format(format), ssID(ssID), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {
this->ranges.push_back(range);
}
CheckpointMetaData(Version version, KeyRange const& range, CheckpointFormat format, UID checkpointID)
: version(version), range(range), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending),
referenceCount(0), gcTime(0) {}
: version(version), format(format), ssID(UID()), checkpointID(checkpointID), state(Pending), referenceCount(0),
gcTime(0) {
this->ranges.push_back(range);
}
CheckpointState getState() const { return static_cast<CheckpointState>(state); }
@ -73,7 +77,7 @@ struct CheckpointMetaData {
void setFormat(CheckpointFormat format) { this->format = static_cast<int16_t>(format); }
std::string toString() const {
std::string res = "Checkpoint MetaData:\nRange: " + range.toString() + "\nVersion: " + std::to_string(version) +
std::string res = "Checkpoint MetaData:\nRange: " + describe(ranges) + "\nVersion: " + std::to_string(version) +
"\nFormat: " + std::to_string(format) + "\nServer: " + ssID.toString() +
"\nID: " + checkpointID.toString() + "\nState: " + std::to_string(static_cast<int>(state)) +
"\n";
@ -82,7 +86,7 @@ struct CheckpointMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, range, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
serializer(ar, version, ranges, format, state, checkpointID, ssID, gcTime, serializedCheckpoint);
}
};
@ -99,23 +103,28 @@ struct DataMoveMetaData {
constexpr static FileIdentifier file_identifier = 13804362;
UID id; // A unique id for this data move.
Version version;
KeyRange range;
std::vector<KeyRange> ranges;
int priority;
std::set<UID> src;
std::set<UID> dest;
std::set<UID> checkpoints;
int16_t phase; // DataMoveMetaData::Phase.
int8_t mode;
DataMoveMetaData() = default;
DataMoveMetaData(UID id, Version version, KeyRange range)
: id(id), version(version), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), range(std::move(range)), priority(0) {}
DataMoveMetaData(UID id, Version version, KeyRange range) : id(id), version(version), priority(0), mode(0) {
this->ranges.push_back(range);
}
DataMoveMetaData(UID id, KeyRange range) : id(id), version(invalidVersion), priority(0), mode(0) {
this->ranges.push_back(range);
}
Phase getPhase() const { return static_cast<Phase>(phase); }
void setPhase(Phase phase) { this->phase = static_cast<int16_t>(phase); }
std::string toString() const {
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + range.toString() +
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + describe(ranges) +
" [Phase]: " + std::to_string(static_cast<int>(phase)) +
" [Source Servers]: " + describe(src) + " [Destination Servers]: " + describe(dest);
return res;
@ -123,7 +132,7 @@ struct DataMoveMetaData {
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, id, version, range, phase, src, dest);
serializer(ar, id, version, ranges, priority, src, dest, checkpoints, phase, mode);
}
};

View File

@ -890,16 +890,16 @@ struct ChangeFeedStreamRequest {
KeyRange range;
int replyBufferSize = -1;
bool canReadPopped = true;
UID debugUID; // This is only used for debugging and tracing, but being able to link a client + server side stream
// is so useful for testing, and this is such small overhead compared to streaming large amounts of
// change feed data, it is left in the interface
UID id; // This must be globally unique among ChangeFeedStreamRequest instances
Optional<ReadOptions> options;
ReplyPromiseStream<ChangeFeedStreamReply> reply;
ChangeFeedStreamRequest() {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, debugUID, arena);
serializer(
ar, rangeID, begin, end, range, reply, spanContext, replyBufferSize, canReadPopped, id, options, arena);
}
};

View File

@ -92,8 +92,6 @@ void decodeKeyServersValue(RangeResult result,
UID& destID,
bool missingIsError = true);
extern const KeyRef clusterIdKey;
extern const KeyRangeRef auditRange;
extern const KeyRef auditPrefix;
const Key auditRangeKey(const AuditType type, const UID& auditId, const KeyRef& key);
@ -505,6 +503,9 @@ extern const KeyRangeRef timeKeeperPrefixRange;
extern const KeyRef timeKeeperVersionKey;
extern const KeyRef timeKeeperDisableKey;
// Durable cluster ID key
extern const KeyRef clusterIdKey;
// Layer status metadata prefix
extern const KeyRangeRef layerStatusMetaPrefixRange;

View File

@ -607,7 +607,7 @@ public:
Key getTagQuotaKey(TransactionTagRef);
template <class Tr>
void setTagQuota(Reference<Tr> tr, TransactionTagRef tag, double reservedQuota, double totalQuota) {
void setTagQuota(Reference<Tr> tr, TransactionTagRef tag, int64_t reservedQuota, int64_t totalQuota) {
TagQuotaValue tagQuotaValue;
tagQuotaValue.reservedQuota = reservedQuota;
tagQuotaValue.totalQuota = totalQuota;

View File

@ -211,6 +211,31 @@ struct TenantMetadata {
};
typedef VersionedMap<TenantName, TenantMapEntry> TenantMap;
class TenantPrefixIndex : public VersionedMap<Key, TenantName>, public ReferenceCounted<TenantPrefixIndex> {};
// A set of tenant names that is generally expected to have one item in it. The set can have more than one item in it
// during certain periods when the set is being updated (e.g. while restoring a backup), but it is expected to have
// one item at the end. It is not possible to use the set while it contains more than one item.
struct TenantNameUniqueSet {
std::unordered_set<TenantName> tenantNames;
// Returns the single tenant name stored in the set
// It is an error to call this function if the set holds more than one name
TenantName get() const {
ASSERT(tenantNames.size() == 1);
return *tenantNames.begin();
}
void insert(TenantName const& name) { tenantNames.insert(name); }
// Removes a tenant name from the set. Returns true if the set is now empty.
bool remove(TenantName const& name) {
auto itr = tenantNames.find(name);
ASSERT(itr != tenantNames.end());
tenantNames.erase(itr);
return tenantNames.empty();
}
};
class TenantPrefixIndex : public VersionedMap<Key, TenantNameUniqueSet>, public ReferenceCounted<TenantPrefixIndex> {};
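A short sketch of the intended lifecycle, assuming a transient overlap while a backup restore is in flight (the tenant names are hypothetical):
// Hedged sketch: the set briefly holds two names, then collapses back to exactly one.
TenantNameUniqueSet names;
names.insert("accounting"_sr);
names.insert("accounting_restored"_sr); // transient second entry during the restore
// names.get() would ASSERT here because more than one name is present.
bool nowEmpty = names.remove("accounting_restored"_sr);
ASSERT(!nowEmpty && names.get() == "accounting"_sr);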
#endif

View File

@ -68,6 +68,10 @@ using TenantEntryCachePayloadFunc = std::function<TenantEntryCachePayload<T>(con
// 1. Lookup by 'TenantId'
// 2. Lookup by 'TenantPrefix'
// 3. Lookup by 'TenantName'
// TODO: Currently this cache performs poorly if there is tenant access to unknown tenants, which happens most
// frequently in optional tenant mode but can also happen in required mode if a lot of tenants are created. Further,
// as a consequence of the design, we cannot be sure that the state of a given tenant is accurate even if it is
// present in the cache.
template <class T>
class TenantEntryCache : public ReferenceCounted<TenantEntryCache<T>>, NonCopyable {

View File

@ -205,6 +205,7 @@ public:
Version getCommittedVersion() override;
ThreadFuture<VersionVector> getVersionVector() override;
ThreadFuture<SpanContext> getSpanContext() override;
ThreadFuture<int64_t> getTotalCost() override;
ThreadFuture<int64_t> getApproximateSize() override;
ThreadFuture<uint64_t> getProtocolVersion();

View File

@ -273,17 +273,4 @@ struct ITracer {
virtual void trace(Span const& span) = 0;
};
void openTracer(TracerType type);
template <class T>
struct SpannedDeque : Deque<T> {
Span span;
explicit SpannedDeque(Location loc) : span(loc) {}
SpannedDeque(SpannedDeque&& other) : Deque<T>(std::move(other)), span(std::move(other.span)) {}
SpannedDeque(SpannedDeque const&) = delete;
SpannedDeque& operator=(SpannedDeque const&) = delete;
SpannedDeque& operator=(SpannedDeque&& other) {
*static_cast<Deque<T>*>(this) = std::move(other);
span = std::move(other.span);
}
};
void openTracer(TracerType type);

View File

@ -77,6 +77,7 @@ template <class T>
class PTreeFinger {
using PTreeFingerEntry = PTree<T> const*;
// This finger size supports trees with up to exp(96/4.3) ~= 4,964,514,749 entries.
// The number 4.3 comes from here: https://en.wikipedia.org/wiki/Random_binary_tree#The_longest_path
// see also: check().
static constexpr size_t N = 96;
PTreeFingerEntry entries_[N];
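A quick, self-contained arithmetic check of the capacity estimate quoted in the comment above; 4.3 is the longest-path constant for random binary trees cited there:
// Hedged sanity check: exp(96 / 4.3) is roughly 4.96 billion, matching the comment.
#include <cmath>
#include <cstdio>
int main() {
	std::printf("%.0f\n", std::exp(96.0 / 4.3)); // prints approximately 4964514749
	return 0;
}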

View File

@ -120,6 +120,8 @@ description is not currently required but encouraged.
<Option name="future_version_client_library" code="66"
paramType="String" paramDescription="path to client library"
description="Adds an external client library to be used with a future version protocol. This option can be used testing purposes only!" />
<Option name="retain_client_library_copies" code="67"
description="Retain temporary external client library copies that are created for enabling multi-threading." />
<Option name="disable_client_statistics_logging" code="70"
description="Disables logging of client statistics, such as sampled transaction activity." />
<Option name="enable_slow_task_profiling" code="71"
@ -279,7 +281,7 @@ description is not currently required but encouraged.
description="Set the transaction size limit in bytes. The size is calculated by combining the sizes of all keys and values written or mutated, all key ranges cleared, and all read and write conflict ranges. (In other words, it includes the total size of all data included in the request to the cluster to commit the transaction.) Large transactions can cause performance problems on FoundationDB clusters, so setting this limit to a smaller value than the default can help prevent the client from accidentally degrading the cluster's performance. This value must be at least 32 and cannot be set to higher than 10,000,000, the default transaction size limit." />
<Option name="idempotency_id" code="504"
paramType="String" paramDescription="Unique ID"
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use."
description="Associate this transaction with this ID for the purpose of checking whether or not this transaction has already committed. Must be at least 16 bytes and less than 256 bytes. This feature is in development and not ready for general use. Unless the automatic_idempotency option is set after this option, the client will not automatically attempt to remove this id from the cluster after a successful commit."
hidden="true" />
<Option name="automatic_idempotency" code="505"
description="Automatically assign a random 16 byte idempotency id for this transaction. Prevents commits from failing with ``commit_unknown_result``. WARNING: If you are also using the multiversion client or transaction timeouts, if either cluster_version_changed or transaction_timed_out was thrown during a commit, then that commit may have already succeeded or may succeed in the future. This feature is in development and not ready for general use."

View File

@ -5,9 +5,8 @@ get_target_property(fdbclient_target_includes fdbclient INCLUDE_DIRECTORIES)
target_link_libraries(fdbmonitor PUBLIC SimpleOpt)
target_include_directories(fdbmonitor PUBLIC "${fdbclient_target_includes}")
strip_debug_symbols(fdbmonitor)
assert_no_version_h(fdbmonitor)
if(UNIX AND NOT APPLE)
target_link_libraries(fdbmonitor PRIVATE rt)
target_link_libraries(fdbmonitor PRIVATE rt)
endif()
# FIXME: This include directory is an ugly hack. We probably want to fix this.
# as soon as we get rid of the old build system
@ -17,17 +16,17 @@ target_link_libraries(fdbmonitor PUBLIC Threads::Threads)
# appears to change its behavior (it no longer seems to restart killed
# processes). fdbmonitor is single-threaded anyway.
get_target_property(fdbmonitor_options fdbmonitor COMPILE_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY COMPILE_OPTIONS ${fdbmonitor_options})
endif ()
endif()
get_target_property(fdbmonitor_options fdbmonitor LINK_OPTIONS)
if (NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
if(NOT "${fdbmonitor_options}" STREQUAL "fdbmonitor_options-NOTFOUND")
list(REMOVE_ITEM fdbmonitor_options "-fsanitize=thread")
set_property(TARGET fdbmonitor PROPERTY LINK_OPTIONS ${fdbmonitor_options})
endif ()
endif()
if(GENERATE_DEBUG_PACKAGES)
fdb_install(TARGETS fdbmonitor DESTINATION fdbmonitor COMPONENT server)
@ -51,7 +50,7 @@ add_custom_target(clean_sandbox
add_custom_target(start_sandbox
COMMAND ${CMAKE_BINARY_DIR}/bin/fdbmonitor --conffile ${CMAKE_BINARY_DIR}/sandbox/foundationdb.conf
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
--lockfile ${CMAKE_BINARY_DIR}/sandbox/fdbmonitor.lock)
add_dependencies(start_sandbox fdbmonitor fdbserver)
@ -61,6 +60,6 @@ if(NOT EXISTS ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh)
endif()
add_custom_target(generate_profile
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
COMMAND ${CMAKE_BINARY_DIR}/contrib/generate_profile.sh ${CMAKE_BINARY_DIR})
add_dependencies(generate_profile fdbmonitor fdbserver mako fdbcli)

View File

@ -133,3 +133,50 @@ Future<Void> CounterCollection::traceCounters(std::string const& traceEventName,
return CounterCollectionImpl::traceCounters(
this, traceEventName, traceEventID, interval, trackLatestName, decorator);
}
void LatencyBands::insertBand(double value) {
bands.emplace(std::make_pair(value, std::make_unique<Counter>(format("Band%f", value), *cc)));
}
FDB_DEFINE_BOOLEAN_PARAM(Filtered);
LatencyBands::LatencyBands(std::string const& name,
UID id,
double loggingInterval,
std::function<void(TraceEvent&)> const& decorator)
: name(name), id(id), loggingInterval(loggingInterval), decorator(decorator) {}
void LatencyBands::addThreshold(double value) {
if (value > 0 && bands.count(value) == 0) {
if (bands.size() == 0) {
ASSERT(!cc && !filteredCount);
cc = std::make_unique<CounterCollection>(name, id.toString());
logger = cc->traceCounters(name, id, loggingInterval, id.toString() + "/" + name, decorator);
filteredCount = std::make_unique<Counter>("Filtered", *cc);
insertBand(std::numeric_limits<double>::infinity());
}
insertBand(value);
}
}
void LatencyBands::addMeasurement(double measurement, int count, Filtered filtered) {
if (filtered && filteredCount) {
(*filteredCount) += count;
} else if (bands.size() > 0) {
auto itr = bands.upper_bound(measurement);
ASSERT(itr != bands.end());
(*itr->second) += count;
}
}
void LatencyBands::clearBands() {
logger = Void();
bands.clear();
filteredCount.reset();
cc.reset();
}
LatencyBands::~LatencyBands() {
clearBands();
}
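A hedged usage sketch of the API implemented above; the event name, UID, logging interval, and thresholds are made up for illustration:
// Hedged sketch: thresholds lazily create counters; a measurement is counted in the
// first band whose threshold is strictly greater than the measured value.
LatencyBands grvBands("GRVLatencyBands", deterministicRandom()->randomUniqueID(), 5.0,
                      [](TraceEvent& ev) { ev.detail("Source", "Example"); });
grvBands.addThreshold(0.005);
grvBands.addThreshold(0.05);
grvBands.addMeasurement(0.012, 1, Filtered::False); // lands in the 0.05 band (upper_bound of 0.012)
grvBands.addMeasurement(0.2, 1, Filtered::True); // counted only by the Filtered counter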

View File

@ -757,12 +757,18 @@ Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const BasicLoadBalanc
Optional<BasicLoadBalancedReply> getBasicLoadBalancedReply(const void*);
// A simpler version of LoadBalance that does not send second requests where the list of servers are always fresh
//
// If |alternativeChosen| is not null, then atMostOnce must be True, and if the returned future completes successfully
// then *alternativeChosen will be the alternative to which the message was sent. *alternativeChosen must outlive the
// returned future.
ACTOR template <class Interface, class Request, class Multi, bool P>
Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> alternatives,
RequestStream<Request, P> Interface::*channel,
Request request = Request(),
TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint,
AtMostOnce atMostOnce = AtMostOnce::False) {
AtMostOnce atMostOnce = AtMostOnce::False,
int* alternativeChosen = nullptr) {
ASSERT(alternativeChosen == nullptr || atMostOnce == AtMostOnce::True);
setReplyPriority(request, taskID);
if (!alternatives)
return Never();
@ -791,6 +797,9 @@ Future<REPLY_TYPE(Request)> basicLoadBalance(Reference<ModelInterface<Multi>> al
useAlt = (nextAlt + alternatives->size() - 1) % alternatives->size();
stream = &alternatives->get(useAlt, channel);
if (alternativeChosen != nullptr) {
*alternativeChosen = useAlt;
}
if (!IFailureMonitor::failureMonitor().getState(stream->getEndpoint()).failed)
break;
nextAlt = (nextAlt + 1) % alternatives->size();
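A hedged call-site sketch for the new alternativeChosen out-parameter; the interface, stream, and request types below are placeholders, and the call is assumed to sit inside an ACTOR:
// Hedged sketch: record which alternative actually served an at-most-once request.
state int chosenAlternative = -1;
ExampleReply reply = wait(basicLoadBalance(interfaces, // hypothetical Reference<ModelInterface<ExampleInterface>>
                                           &ExampleInterface::exampleStream, // hypothetical request stream member
                                           ExampleRequest(),
                                           TaskPriority::DefaultPromiseEndpoint,
                                           AtMostOnce::True,
                                           &chosenAlternative));
// chosenAlternative now indexes the alternative the request was sent to.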

Some files were not shown because too many files have changed in this diff.