Merge branch 'main' of github.com:apple/foundationdb

Ankita Kejriwal 2023-02-06 16:24:47 -08:00
commit 68f3e29a47
238 changed files with 7329 additions and 2453 deletions

View File

@ -17,4 +17,5 @@ if(WITH_RUBY_BINDING)
endif()
if(NOT WIN32 AND NOT OPEN_FOR_IDE)
package_bindingtester()
package_bindingtester2()
endif()

View File

@ -22,10 +22,8 @@
import sys
import subprocess
import struct
import random
import argparse
import math
import os
import copy
import traceback

View File

@ -32,7 +32,9 @@ The tenant API introduces some new operations:
#### TENANT_SET_ACTIVE
Pops the top item off of the stack as TENANT_NAME. Opens the tenant with
name TENANT_NAME and stores it as the active tenant.
name TENANT_NAME and stores it as the active tenant. Then, waits on a future
that initializes the tenant ID. When complete, pushes the string
"SET_ACTIVE_TENANT" onto the stack.
#### TENANT_CLEAR_ACTIVE
@ -46,6 +48,12 @@ The tenant API introduces some new operations:
packed into a tuple as [t1,t2,t3,...,tn], and this single packed value
is pushed onto the stack.
#### TENANT_GET_ID
Attempts to resolve the active tenant's ID. Pushes the string "GOT_TENANT_ID" onto
the stack if an ID was successfully read after waiting on the ID future. Pushes the string
"NO_ACTIVE_TENANT" if there is no active tenant.
Updates to Existing Instructions
--------------------------------

View File

@ -175,7 +175,7 @@ class ApiTest(Test):
write_conflicts = ['WRITE_CONFLICT_RANGE', 'WRITE_CONFLICT_KEY', 'DISABLE_WRITE_CONFLICT']
txn_sizes = ['GET_APPROXIMATE_SIZE']
storage_metrics = ['GET_ESTIMATED_RANGE_SIZE', 'GET_RANGE_SPLIT_POINTS']
tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST']
tenants = ['TENANT_CREATE', 'TENANT_DELETE', 'TENANT_SET_ACTIVE', 'TENANT_CLEAR_ACTIVE', 'TENANT_LIST', 'TENANT_GET_ID']
op_choices += reads
op_choices += mutations
@ -610,6 +610,7 @@ class ApiTest(Test):
tenant_name = self.choose_tenant(0.8)
instructions.push_args(tenant_name)
instructions.append(op)
self.add_strings(1)
elif op == 'TENANT_CLEAR_ACTIVE':
instructions.append(op)
elif op == 'TENANT_LIST':
@ -619,6 +620,9 @@ class ApiTest(Test):
test_util.to_front(instructions, 2)
instructions.append(op)
self.add_strings(1)
elif op == "TENANT_GET_ID":
instructions.append(op)
self.add_strings(1)
else:
assert False, 'Unknown operation: ' + op

View File

@ -217,8 +217,8 @@ if(NOT WIN32)
target_link_libraries(fdb_c_unit_tests_version_510 PRIVATE fdb_c Threads::Threads doctest)
target_link_libraries(trace_partial_file_suffix_test PRIVATE fdb_c Threads::Threads flow doctest)
target_link_libraries(disconnected_timeout_unit_tests PRIVATE fdb_c Threads::Threads doctest)
target_link_libraries(fdb_c_client_config_tester PRIVATE SimpleOpt fdb_cpp fdb_c Threads::Threads fmt::fmt)
target_include_directories(fdb_c_client_config_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include)
target_link_libraries(fdb_c_client_config_tester PRIVATE SimpleOpt fdb_cpp fdb_c fdbclient Threads::Threads fmt::fmt)
target_include_directories(fdb_c_client_config_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/)
# do not set RPATH for mako
set_property(TARGET mako PROPERTY SKIP_BUILD_RPATH TRUE)
@ -423,18 +423,18 @@ if(OPEN_FOR_IDE)
target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads)
target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include)
elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer only
elseif(NOT WIN32 AND NOT APPLE) # Linux Only
set(SHIM_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(SHIM_LIB_GEN_SRC
${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.init.c
${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.init.cpp
${SHIM_LIB_OUTPUT_DIR}/libfdb_c.so.tramp.S)
set(IMPLIBSO_SRC_DIR ${CMAKE_SOURCE_DIR}/contrib/Implib.so)
set(IMPLIBSO_SRC
${IMPLIBSO_SRC_DIR}/implib-gen.py
${IMPLIBSO_SRC_DIR}/arch/common/init.c.tpl
${IMPLIBSO_SRC_DIR}/arch/common/init.cpp.tpl
${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/config.ini
${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/table.S.tpl
${IMPLIBSO_SRC_DIR}/arch/${CMAKE_SYSTEM_PROCESSOR}/trampoline.S.tpl
@ -467,6 +467,11 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer
target_link_libraries(fdb_c_shim_lib_tester PRIVATE fdb_c_shim SimpleOpt fdb_cpp Threads::Threads)
target_include_directories(fdb_c_shim_lib_tester PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/foundationdb/ ${CMAKE_SOURCE_DIR}/flow/include)
set(SHIM_LIB_TEST_EXTRA_OPTIONS "")
if(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR USE_SANITIZER)
list(APPEND SHIM_LIB_TEST_EXTRA_OPTIONS --disable-prev-version-tests)
endif()
add_python_venv_test(NAME fdb_c_shim_library_tests
COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/test/fdb_c_shim_tests.py
--build-dir ${CMAKE_BINARY_DIR}
@ -474,6 +479,7 @@ elseif(NOT WIN32 AND NOT APPLE AND NOT USE_SANITIZER) # Linux Only, non-santizer
--api-tester-bin $<TARGET_FILE:fdb_c_shim_api_tester>
--shim-lib-tester-bin $<TARGET_FILE:fdb_c_shim_lib_tester>
--api-test-dir ${CMAKE_SOURCE_DIR}/bindings/c/test/apitester/tests
${SHIM_LIB_TEST_EXTRA_OPTIONS}
)
endif() # End Linux only, non-sanitizer only

View File

@ -18,6 +18,8 @@
* limitations under the License.
*/
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/FDBTypes.h"
#include "flow/ProtocolVersion.h"
#include <cstdint>
@ -61,6 +63,11 @@ int g_api_version = 0;
/* This must be true so that we can return the data pointer of a
Standalone<RangeResultRef> as an array of FDBKeyValue. */
static_assert(sizeof(FDBKeyValue) == sizeof(KeyValueRef), "FDBKeyValue / KeyValueRef size mismatch");
static_assert(sizeof(FDBBGMutation) == sizeof(GranuleMutationRef), "FDBBGMutation / GranuleMutationRef size mismatch");
static_assert(static_cast<int>(FDB_BG_MUTATION_TYPE_SET_VALUE) == static_cast<int>(MutationRef::Type::SetValue),
"FDB_BG_MUTATION_TYPE_SET_VALUE enum value mismatch");
static_assert(static_cast<int>(FDB_BG_MUTATION_TYPE_CLEAR_RANGE) == static_cast<int>(MutationRef::Type::ClearRange),
"FDB_BG_MUTATION_TYPE_CLEAR_RANGE enum value mismatch");
#define TSAV_ERROR(type, error) ((FDBFuture*)(ThreadFuture<type>(error())).extractPtr())
@ -333,6 +340,99 @@ extern "C" DLLEXPORT fdb_error_t fdb_future_get_granule_summary_array(FDBFuture*
*out_count = na.size(););
}
namespace {
void setBlobFilePointer(FDBBGFilePointer* dest, const BlobFilePointerRef& source) {
dest->filename_ptr = source.filename.begin();
dest->filename_length = source.filename.size();
dest->file_offset = source.offset;
dest->file_length = source.length;
dest->full_file_length = source.fullFileLength;
}
void setBGMutation(FDBBGMutation* dest, int64_t version, const MutationRef& source) {
dest->version = version;
dest->type = source.type;
dest->param1_ptr = source.param1.begin();
dest->param1_length = source.param1.size();
dest->param2_ptr = source.param2.begin();
dest->param2_length = source.param2.size();
}
void setBGMutations(FDBBGMutation** mutationsOut, int* mutationCountOut, Arena& ar, const GranuleDeltas& deltas) {
// convert mutations from MutationsAndVersionRef to single mutations
int mutationCount = 0;
for (auto& it : deltas) {
mutationCount += it.mutations.size();
}
*mutationCountOut = mutationCount;
if (mutationCount > 0) {
*mutationsOut = new (ar) FDBBGMutation[mutationCount];
mutationCount = 0;
for (auto& it : deltas) {
for (auto& m : it.mutations) {
setBGMutation(&((*mutationsOut)[mutationCount]), it.version, m);
mutationCount++;
}
}
ASSERT(mutationCount == *mutationCountOut);
}
}
} // namespace
extern "C" DLLEXPORT fdb_error_t fdb_future_readbg_get_descriptions(FDBFuture* f,
FDBBGFileDescription** out,
int* desc_count) {
CATCH_AND_RETURN(Standalone<VectorRef<BlobGranuleChunkRef>> results =
TSAV(Standalone<VectorRef<BlobGranuleChunkRef>>, f)->get();
*desc_count = results.size();
Arena ar;
*out = new (ar) FDBBGFileDescription[results.size()];
for (int chunkIdx = 0; chunkIdx < results.size(); chunkIdx++) {
BlobGranuleChunkRef& chunk = results[chunkIdx];
FDBBGFileDescription& desc = (*out)[chunkIdx];
// set key range
desc.key_range.begin_key = chunk.keyRange.begin.begin();
desc.key_range.begin_key_length = chunk.keyRange.begin.size();
desc.key_range.end_key = chunk.keyRange.end.begin();
desc.key_range.end_key_length = chunk.keyRange.end.size();
// snapshot file
desc.snapshot_present = chunk.snapshotFile.present();
if (desc.snapshot_present) {
setBlobFilePointer(&desc.snapshot_file_pointer, chunk.snapshotFile.get());
}
// delta files
desc.delta_file_count = chunk.deltaFiles.size();
if (chunk.deltaFiles.size()) {
desc.delta_files = new (ar) FDBBGFilePointer[chunk.deltaFiles.size()];
for (int d = 0; d < chunk.deltaFiles.size(); d++) {
setBlobFilePointer(&desc.delta_files[d], chunk.deltaFiles[d]);
}
}
setBGMutations(&desc.memory_mutations, &desc.memory_mutation_count, ar, chunk.newDeltas);
}
// make this memory owned by the arena of the object stored in the future
results.arena()
.dependsOn(ar););
}
extern "C" DLLEXPORT FDBResult* fdb_readbg_parse_snapshot_file(const uint8_t* file_data, int file_len) {
RETURN_RESULT_ON_ERROR(RangeResult,
RangeResult parsedSnapshotData = bgReadSnapshotFile(StringRef(file_data, file_len));
return ((FDBResult*)(ThreadResult<RangeResult>(parsedSnapshotData)).extractPtr()););
}
extern "C" DLLEXPORT FDBResult* fdb_readbg_parse_delta_file(const uint8_t* file_data, int file_len) {
RETURN_RESULT_ON_ERROR(
Standalone<VectorRef<GranuleMutationRef>>,
Standalone<VectorRef<GranuleMutationRef>> parsedDeltaData = bgReadDeltaFile(StringRef(file_data, file_len));
return ((FDBResult*)(ThreadResult<Standalone<VectorRef<GranuleMutationRef>>>(parsedDeltaData)).extractPtr()););
}
extern "C" DLLEXPORT void fdb_result_destroy(FDBResult* r) {
CATCH_AND_DIE(TSAVB(r)->cancel(););
}
@ -346,6 +446,13 @@ fdb_error_t fdb_result_get_keyvalue_array(FDBResult* r,
*out_more = rr.more;);
}
fdb_error_t fdb_result_get_bg_mutations_array(FDBResult* r, FDBBGMutation const** out_mutations, int* out_count) {
CATCH_AND_RETURN(Standalone<VectorRef<GranuleMutationRef>> mutations =
TSAV(Standalone<VectorRef<GranuleMutationRef>>, r)->get();
*out_mutations = (FDBBGMutation*)mutations.begin();
*out_count = mutations.size(););
}
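Together, the parse functions and the result accessor give a synchronous path from raw file bytes to mutations. A hedged usage sketch based only on the signatures introduced here (the header path and API version are assumptions, and error handling is reduced to a bare check):

#define FDB_API_VERSION 720            // assumption: a version that exposes these functions
#include "foundationdb/fdb_c.h"
#include <vector>

// Parse an already-loaded delta file and walk the mutations it contains.
void walkDeltaFile(const std::vector<uint8_t>& bytes) {
    FDBResult* r = fdb_readbg_parse_delta_file(bytes.data(), (int)bytes.size());
    const FDBBGMutation* mutations = nullptr;
    int count = 0;
    if (fdb_result_get_bg_mutations_array(r, &mutations, &count) == 0) {
        for (int i = 0; i < count; i++) {
            // mutations[i].type, param1_ptr/param1_length and param2_ptr/param2_length
            // point into memory owned by r, so consume them before destroying the result.
        }
    }
    fdb_result_destroy(r); // releases the parsed data
}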
FDBFuture* fdb_create_cluster_v609(const char* cluster_file_path) {
char* path;
if (cluster_file_path) {
@ -1088,6 +1195,28 @@ extern "C" DLLEXPORT FDBFuture* fdb_transaction_summarize_blob_granules(FDBTrans
return (FDBFuture*)(TXN(tr)->summarizeBlobGranules(range, sv, rangeLimit).extractPtr()););
}
// copied from read_blob_granules_start
extern "C" DLLEXPORT FDBFuture* fdb_transaction_read_blob_granules_description(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t begin_version,
int64_t read_version,
int64_t* read_version_out) {
Optional<Version> rv;
if (read_version != latestVersion) {
rv = read_version;
}
return (FDBFuture*)(TXN(tr)
->readBlobGranulesStart(KeyRangeRef(KeyRef(begin_key_name, begin_key_name_length),
KeyRef(end_key_name, end_key_name_length)),
begin_version,
rv,
read_version_out)
.extractPtr());
}
#include "fdb_c_function_pointers.g.h"
#define FDB_API_CHANGED(func, ver) \

View File

@ -214,6 +214,39 @@ typedef struct readgranulecontext {
int granuleParallelism;
} FDBReadBlobGranuleContext;
typedef struct bgfilepointer {
const uint8_t* filename_ptr;
int filename_length;
int64_t file_offset;
int64_t file_length;
int64_t full_file_length;
/* TODO: encryption keys would go here */
} FDBBGFilePointer;
typedef enum { FDB_BG_MUTATION_TYPE_SET_VALUE = 0, FDB_BG_MUTATION_TYPE_CLEAR_RANGE = 1 } FDBBGMutationType;
#pragma pack(push, 4)
typedef struct bgmutation {
/* FDBBGMutationType */ uint8_t type;
int64_t version;
const uint8_t* param1_ptr;
int param1_length;
const uint8_t* param2_ptr;
int param2_length;
} FDBBGMutation;
typedef struct bgfiledescription {
FDBKeyRange key_range;
fdb_bool_t snapshot_present;
FDBBGFilePointer snapshot_file_pointer;
int delta_file_count;
FDBBGFilePointer* delta_files;
int memory_mutation_count;
FDBBGMutation* memory_mutations;
/* TODO: tenant info would go here */
} FDBBGFileDescription;
#pragma pack(pop)
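A small sketch of how a caller might walk one FDBBGFileDescription using the fields defined above (illustrative only; it assumes this header has been included with FDB_API_VERSION defined, and keys are binary, so printing them as text is purely for demonstration):

#include <cstdio>

// Report the key range, snapshot file, delta files, and in-memory mutations of one description.
void printDescription(const FDBBGFileDescription& d) {
    std::printf("range: %.*s - %.*s\n",
                d.key_range.begin_key_length, (const char*)d.key_range.begin_key,
                d.key_range.end_key_length, (const char*)d.key_range.end_key);
    if (d.snapshot_present) {
        std::printf("snapshot file: %.*s, %lld bytes\n",
                    d.snapshot_file_pointer.filename_length,
                    (const char*)d.snapshot_file_pointer.filename_ptr,
                    (long long)d.snapshot_file_pointer.file_length);
    }
    for (int i = 0; i < d.delta_file_count; i++) {
        std::printf("delta file %d: %lld bytes\n", i, (long long)d.delta_files[i].file_length);
    }
    std::printf("%d in-memory mutations\n", d.memory_mutation_count);
}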
DLLEXPORT void fdb_future_cancel(FDBFuture* f);
DLLEXPORT void fdb_future_release_memory(FDBFuture* f);
@ -275,6 +308,15 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_get_granule_summary_array(FD
FDBGranuleSummary const** out_summaries,
int* out_count);
/* all for using future result from read_blob_granules_description */
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_future_readbg_get_descriptions(FDBFuture* f,
FDBBGFileDescription** out,
int* desc_count);
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_readbg_parse_snapshot_file(const uint8_t* file_data, int file_len);
DLLEXPORT WARN_UNUSED_RESULT FDBResult* fdb_readbg_parse_delta_file(const uint8_t* file_data, int file_len);
/* FDBResult is a synchronous computation result, as opposed to a future that is asynchronous. */
DLLEXPORT void fdb_result_destroy(FDBResult* r);
@ -283,6 +325,10 @@ DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_result_get_keyvalue_array(FDBResult
int* out_count,
fdb_bool_t* out_more);
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_result_get_bg_mutations_array(FDBResult* r,
FDBBGMutation const** out_mutations,
int* out_count);
/* TODO: add other return types as we need them */
DLLEXPORT WARN_UNUSED_RESULT fdb_error_t fdb_create_database(const char* cluster_file_path, FDBDatabase** out_database);
@ -582,6 +628,15 @@ DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_summarize_blob_granules(
int64_t summaryVersion,
int rangeLimit);
DLLEXPORT WARN_UNUSED_RESULT FDBFuture* fdb_transaction_read_blob_granules_description(FDBTransaction* tr,
uint8_t const* begin_key_name,
int begin_key_name_length,
uint8_t const* end_key_name,
int end_key_name_length,
int64_t begin_version,
int64_t read_version,
int64_t* read_version_out);
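A hedged sketch of the intended call sequence for this API, combining the declaration above with the existing future helpers; fdb_check() is a local convenience rather than part of the C API, the version number is an assumption, and -2 is used to mean "latest read version" as in the tester workload later in this diff:

#define FDB_API_VERSION 720            // assumption
#include "foundationdb/fdb_c.h"
#include <cstdlib>

static void fdb_check(fdb_error_t e) { if (e) std::abort(); } // minimal local error handling

void readGranuleDescriptions(FDBTransaction* tr,
                             const uint8_t* begin, int beginLen,
                             const uint8_t* end, int endLen) {
    int64_t readVersionOut = -1;
    FDBFuture* f = fdb_transaction_read_blob_granules_description(
        tr, begin, beginLen, end, endLen, /*begin_version*/ 0, /*read_version*/ -2, &readVersionOut);
    fdb_check(fdb_future_block_until_ready(f));
    fdb_check(fdb_future_get_error(f));
    FDBBGFileDescription* descs = nullptr;
    int descCount = 0;
    fdb_check(fdb_future_readbg_get_descriptions(f, &descs, &descCount));
    // descs[0..descCount) stay valid until the future is destroyed: the implementation
    // attaches their allocation to the future's own memory.
    fdb_future_destroy(f);
}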
#define FDB_KEYSEL_LAST_LESS_THAN(k, l) k, l, 0, 0
#define FDB_KEYSEL_LAST_LESS_OR_EQUAL(k, l) k, l, 1, 0
#define FDB_KEYSEL_FIRST_GREATER_THAN(k, l) k, l, 1, 1

View File

@ -70,12 +70,15 @@ void ApiWorkload::start() {
schedule([this]() {
// 1. Clear data
clearData([this]() {
// 2. Workload setup
setup([this]() {
// 3. Populate initial data
populateData([this]() {
// 4. Generate random workload
runTests();
// 2. Create tenants if necessary.
createTenantsIfNecessary([this] {
// 3. Workload setup.
setup([this]() {
// 4. Populate initial data
populateData([this]() {
// 5. Generate random workload
runTests();
});
});
});
});
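The nesting above is the workload's continuation-passing style: each stage takes the next stage as a callback, so the asynchronous steps run strictly in order even though nothing blocks. A stripped-down sketch of the pattern (illustrative only):

#include <functional>

using TTaskFct = std::function<void()>;

// Each stage does its (possibly asynchronous) work and then invokes the continuation.
void stageA(TTaskFct cont) { /* ... */ cont(); }
void stageB(TTaskFct cont) { /* ... */ cont(); }

void runPipeline() {
    stageA([]() {
        stageB([]() {
            // all stages complete
        });
    });
}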
@ -152,6 +155,21 @@ fdb::Key ApiWorkload::randomKey(double existingKeyRatio, std::optional<int> tena
}
}
fdb::KeyRange ApiWorkload::randomNonEmptyKeyRange() {
fdb::KeyRange keyRange;
keyRange.beginKey = randomKeyName();
// avoid empty key range
do {
keyRange.endKey = randomKeyName();
} while (keyRange.beginKey == keyRange.endKey);
if (keyRange.beginKey > keyRange.endKey) {
std::swap(keyRange.beginKey, keyRange.endKey);
}
ASSERT(keyRange.beginKey < keyRange.endKey);
return keyRange;
}
std::optional<int> ApiWorkload::randomTenant() {
if (tenants.size() > 0) {
return Random::get().randomInt(0, tenants.size() - 1);
@ -244,9 +262,17 @@ void ApiWorkload::createTenants(TTaskFct cont) {
[this, cont]() { schedule(cont); });
}
void ApiWorkload::createTenantsIfNecessary(TTaskFct cont) {
if (tenants.size() > 0) {
createTenants(cont);
} else {
schedule(cont);
}
}
void ApiWorkload::populateData(TTaskFct cont) {
if (tenants.size() > 0) {
createTenants([this, cont]() { populateTenantData(cont, std::make_optional(0)); });
populateTenantData(cont, std::make_optional(0));
} else {
populateTenantData(cont, {});
}

View File

@ -113,6 +113,7 @@ protected:
fdb::Key randomNotExistingKey(std::optional<int> tenantId);
fdb::Key randomExistingKey(std::optional<int> tenantId);
fdb::Key randomKey(double existingKeyRatio, std::optional<int> tenantId);
fdb::KeyRange randomNonEmptyKeyRange();
// Chooses a random tenant from the available tenants (or an empty optional if tenants aren't used in the test)
std::optional<int> randomTenant();
@ -140,6 +141,7 @@ private:
void populateDataTx(TTaskFct cont, std::optional<int> tenantId);
void populateTenantData(TTaskFct cont, std::optional<int> tenantId);
void createTenants(TTaskFct cont);
void createTenantsIfNecessary(TTaskFct cont);
void clearTenantData(TTaskFct cont, std::optional<int> tenantId);

View File

@ -21,6 +21,8 @@
#include "TesterBlobGranuleUtil.h"
#include "TesterUtil.h"
#include <unordered_set>
#include <set>
#include "fdb_api.hpp"
#include <memory>
#include <fmt/format.h>
@ -38,7 +40,7 @@ public:
}
private:
// FIXME: add tenant support for DB operations
// FIXME: use other new blob granule apis!
enum OpType {
OP_INSERT,
OP_CLEAR,
@ -48,84 +50,63 @@ private:
OP_SUMMARIZE,
OP_GET_BLOB_RANGES,
OP_VERIFY,
OP_LAST = OP_VERIFY
OP_READ_DESC,
OP_LAST = OP_READ_DESC
};
std::vector<OpType> excludedOpTypes;
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
std::set<fdb::ByteString> validatedFiles;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
void debugOp(std::string opName, fdb::KeyRange keyRange, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}) {3}: {4}",
opName,
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
debugTenantStr(tenantId),
message));
}
}
void randomReadOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::KeyValue>>();
auto tooOld = std::make_shared<bool>(false);
debugOp("Read", begin, end, tenantId, "starting");
debugOp("Read", keyRange, tenantId, "starting");
execTransaction(
[this, begin, end, tenantId, results, tooOld](auto ctx) {
[this, keyRange, tenantId, results, tooOld](auto ctx) {
ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
TesterGranuleContext testerContext(ctx->getBGBasePath());
fdb::native::FDBReadBlobGranuleContext granuleContext = createGranuleContext(&testerContext);
fdb::Result res = ctx->tx().readBlobGranules(
begin, end, 0 /* beginVersion */, -2 /* latest read version */, granuleContext);
fdb::Result res = ctx->tx().readBlobGranules(keyRange.beginKey,
keyRange.endKey,
0 /* beginVersion */,
-2 /* latest read version */,
granuleContext);
auto out = fdb::Result::KeyValueRefArray{};
fdb::Error err = res.getKeyValueArrayNothrow(out);
if (err.code() == error_code_blob_granule_transaction_too_old) {
bool previousSuccess = seenReadSuccess(tenantId);
if (previousSuccess) {
error("Read bg too old after read success!\n");
} else {
info("Read bg too old\n");
}
ASSERT(!previousSuccess);
*tooOld = true;
ctx->done();
} else if (err.code() != error_code_success) {
ASSERT(err.code() != error_code_blob_granule_transaction_too_old);
if (err.code() != error_code_success) {
ctx->onError(err);
} else {
auto resCopy = copyKeyValueArray(out);
auto& [resVector, out_more] = resCopy;
ASSERT(!out_more);
results.get()->assign(resVector.begin(), resVector.end());
bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) {
info(fmt::format("Read {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
} else {
debugOp("Read", begin, end, tenantId, "complete");
}
debugOp("Read", keyRange, tenantId, "complete");
ctx->done();
}
},
[this, begin, end, results, tooOld, cont, tenantId]() {
[this, keyRange, results, tooOld, cont, tenantId]() {
if (!*tooOld) {
std::vector<fdb::KeyValue> expected =
stores[tenantId].getRange(begin, end, stores[tenantId].size(), false);
stores[tenantId].getRange(keyRange.beginKey, keyRange.endKey, stores[tenantId].size(), false);
if (results->size() != expected.size()) {
error(fmt::format("randomReadOp result size mismatch. expected: {0} actual: {1}",
expected.size(),
@ -161,18 +142,14 @@ private:
}
void randomGetGranulesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
debugOp("GetGranules", begin, end, tenantId, "starting");
debugOp("GetGranules", keyRange, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().getBlobGranuleRanges(begin, end, 1000).eraseType();
[keyRange, results](auto ctx) {
fdb::Future f = ctx->tx().getBlobGranuleRanges(keyRange.beginKey, keyRange.endKey, 1000).eraseType();
ctx->continueAfter(
f,
[ctx, f, results]() {
@ -181,34 +158,26 @@ private:
},
true);
},
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetGranules", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
[this, keyRange, tenantId, results, cont]() {
debugOp("GetGranules", keyRange, tenantId, fmt::format("complete with {0} granules", results->size()));
this->validateRanges(results, keyRange);
schedule(cont);
},
getTenant(tenantId));
}
void randomSummarizeOp(TTaskFct cont, std::optional<int> tenantId) {
if (!seenReadSuccess(tenantId)) {
// tester can't handle this throwing bg_txn_too_old, so just don't call it unless we have already seen a
// read success
schedule(cont);
return;
}
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::GranuleSummary>>();
debugOp("Summarize", begin, end, tenantId, "starting");
debugOp("Summarize", keyRange, tenantId, "starting");
execTransaction(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->tx().summarizeBlobGranules(begin, end, -2 /*latest version*/, 1000).eraseType();
[keyRange, results](auto ctx) {
fdb::Future f =
ctx->tx()
.summarizeBlobGranules(keyRange.beginKey, keyRange.endKey, -2 /*latest version*/, 1000)
.eraseType();
ctx->continueAfter(
f,
[ctx, f, results]() {
@ -217,8 +186,8 @@ private:
},
true);
},
[this, begin, end, tenantId, results, cont]() {
debugOp("Summarize", begin, end, tenantId, fmt::format("complete with {0} granules", results->size()));
[this, keyRange, tenantId, results, cont]() {
debugOp("Summarize", keyRange, tenantId, fmt::format("complete with {0} granules", results->size()));
// use validateRanges to share validation
auto ranges = std::make_shared<std::vector<fdb::KeyRange>>();
@ -233,39 +202,35 @@ private:
ranges->push_back((*results)[i].keyRange);
}
this->validateRanges(ranges, begin, end, true);
this->validateRanges(ranges, keyRange);
schedule(cont);
},
getTenant(tenantId));
}
void validateRanges(std::shared_ptr<std::vector<fdb::KeyRange>> results,
fdb::Key begin,
fdb::Key end,
bool shouldBeRanges) {
if (shouldBeRanges) {
if (results->size() == 0) {
error(fmt::format(
"ValidateRanges: [{0} - {1}): No ranges returned!", fdb::toCharsRef(begin), fdb::toCharsRef(end)));
}
ASSERT(results->size() > 0);
if (results->front().beginKey > begin || results->back().endKey < end) {
error(fmt::format("ValidateRanges: [{0} - {1}): Incomplete range(s) returned [{2} - {3})!",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(results->front().beginKey),
fdb::toCharsRef(results->back().endKey)));
}
ASSERT(results->front().beginKey <= begin);
ASSERT(results->back().endKey >= end);
void validateRanges(std::shared_ptr<std::vector<fdb::KeyRange>> results, fdb::KeyRange keyRange) {
if (results->size() == 0) {
error(fmt::format("ValidateRanges: [{0} - {1}): No ranges returned!",
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey)));
}
ASSERT(results->size() > 0);
if (results->front().beginKey > keyRange.beginKey || results->back().endKey < keyRange.endKey) {
error(fmt::format("ValidateRanges: [{0} - {1}): Incomplete range(s) returned [{2} - {3})!",
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
fdb::toCharsRef(results->front().beginKey),
fdb::toCharsRef(results->back().endKey)));
}
ASSERT(results->front().beginKey <= keyRange.beginKey);
ASSERT(results->back().endKey >= keyRange.endKey);
for (int i = 0; i < results->size(); i++) {
// no empty or inverted ranges
if ((*results)[i].beginKey >= (*results)[i].endKey) {
error(fmt::format("ValidateRanges: [{0} - {1}): Empty/inverted range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
fdb::toCharsRef((*results)[i].beginKey),
fdb::toCharsRef((*results)[i].endKey)));
}
@ -276,8 +241,8 @@ private:
// ranges contain entire requested key range
if ((*results)[i].beginKey != (*results)[i].endKey) {
error(fmt::format("ValidateRanges: [{0} - {1}): Non-covereed range [{2} - {3})",
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
fdb::toCharsRef(keyRange.beginKey),
fdb::toCharsRef(keyRange.endKey),
fdb::toCharsRef((*results)[i - 1].endKey),
fdb::toCharsRef((*results)[i].endKey)));
}
@ -287,27 +252,24 @@ private:
// TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
debugOp("GetBlobRanges", begin, end, tenantId, "starting");
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
debugOp("GetBlobRanges", keyRange, tenantId, "starting");
execOperation(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->dbOps()->listBlobbifiedRanges(begin, end, 1000).eraseType();
[keyRange, results](auto ctx) {
fdb::Future f =
ctx->dbOps()->listBlobbifiedRanges(keyRange.beginKey, keyRange.endKey, 1000).eraseType();
ctx->continueAfter(f, [ctx, f, results]() {
*results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>());
ctx->done();
});
},
[this, begin, end, tenantId, results, cont]() {
debugOp(
"GetBlobRanges", begin, end, tenantId, fmt::format("complete with {0} ranges", results->size()));
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
[this, keyRange, tenantId, results, cont]() {
debugOp("GetBlobRanges", keyRange, tenantId, fmt::format("complete with {0} ranges", results->size()));
this->validateRanges(results, keyRange);
schedule(cont);
},
getTenant(tenantId),
@ -316,38 +278,214 @@ private:
// TODO: tenant support
void randomVerifyOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
if (begin > end) {
std::swap(begin, end);
}
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
debugOp("Verify", begin, end, tenantId, "starting");
debugOp("Verify", keyRange, tenantId, "starting");
auto verifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[begin, end, verifyVersion](auto ctx) {
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
[keyRange, verifyVersion](auto ctx) {
fdb::Future f = ctx->dbOps()
->verifyBlobRange(keyRange.beginKey, keyRange.endKey, -2 /* latest version*/)
.eraseType();
ctx->continueAfter(f, [ctx, verifyVersion, f]() {
*verifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
});
},
[this, begin, end, tenantId, verifyVersion, cont]() {
debugOp("Verify", begin, end, tenantId, fmt::format("Complete @ {0}", *verifyVersion));
bool previousSuccess = seenReadSuccess(tenantId);
if (*verifyVersion == -1) {
ASSERT(!previousSuccess);
} else if (!previousSuccess) {
info(fmt::format("Verify {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
}
[this, keyRange, tenantId, verifyVersion, cont]() {
debugOp("Verify", keyRange, tenantId, fmt::format("Complete @ {0}", *verifyVersion));
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
void validateSnapshotData(std::shared_ptr<ITransactionContext> ctx,
fdb::native::FDBReadBlobGranuleContext& bgCtx,
fdb::GranuleFilePointer snapshotFile,
fdb::KeyRange keyRange) {
if (validatedFiles.contains(snapshotFile.filename)) {
return;
}
validatedFiles.insert(snapshotFile.filename);
int64_t snapshotLoadId = bgCtx.start_load_f((const char*)(snapshotFile.filename.data()),
snapshotFile.filename.size(),
snapshotFile.offset,
snapshotFile.length,
snapshotFile.fullFileLength,
bgCtx.userContext);
fdb::BytesRef snapshotData(bgCtx.get_load_f(snapshotLoadId, bgCtx.userContext), snapshotFile.length);
fdb::Result snapshotRes = ctx->tx().parseSnapshotFile(snapshotData);
auto out = fdb::Result::KeyValueRefArray{};
fdb::Error err = snapshotRes.getKeyValueArrayNothrow(out);
ASSERT(err.code() == error_code_success);
auto res = copyKeyValueArray(out);
bgCtx.free_load_f(snapshotLoadId, bgCtx.userContext);
ASSERT(res.second == false);
for (int i = 0; i < res.first.size(); i++) {
ASSERT(res.first[i].key >= keyRange.beginKey);
ASSERT(res.first[i].key < keyRange.endKey);
if (i > 0) {
ASSERT(res.first[i - 1].key < res.first[i].key);
}
// TODO add snapshot rows to map
}
}
void validateDeltaData(std::shared_ptr<ITransactionContext> ctx,
fdb::native::FDBReadBlobGranuleContext& bgCtx,
fdb::GranuleFilePointer deltaFile,
fdb::KeyRange keyRange,
int64_t& lastDFMaxVersion) {
if (validatedFiles.contains(deltaFile.filename)) {
return;
}
validatedFiles.insert(deltaFile.filename);
int64_t deltaLoadId = bgCtx.start_load_f((const char*)(deltaFile.filename.data()),
deltaFile.filename.size(),
deltaFile.offset,
deltaFile.length,
deltaFile.fullFileLength,
bgCtx.userContext);
fdb::BytesRef deltaData(bgCtx.get_load_f(deltaLoadId, bgCtx.userContext), deltaFile.length);
fdb::Result deltaRes = ctx->tx().parseDeltaFile(deltaData);
auto out = fdb::Result::GranuleMutationRefArray{};
fdb::Error err = deltaRes.getGranuleMutationArrayNothrow(out);
ASSERT(err.code() == error_code_success);
auto res = copyGranuleMutationArray(out);
bgCtx.free_load_f(deltaLoadId, bgCtx.userContext);
int64_t thisDFMaxVersion = 0;
for (int j = 0; j < res.size(); j++) {
fdb::GranuleMutation& m = res[j];
ASSERT(m.version > 0);
ASSERT(m.version > lastDFMaxVersion);
// mutations in delta files aren't necessarily in version order, so just validate ordering w.r.t
// previous file(s)
thisDFMaxVersion = std::max(thisDFMaxVersion, m.version);
ASSERT(m.type == 0 || m.type == 1);
ASSERT(keyRange.beginKey <= m.param1);
ASSERT(m.param1 < keyRange.endKey);
if (m.type == 1) {
ASSERT(keyRange.beginKey <= m.param2);
ASSERT(m.param2 <= keyRange.endKey);
}
}
lastDFMaxVersion = std::max(lastDFMaxVersion, thisDFMaxVersion);
// TODO have delta mutations update map
}
void validateBGDescriptionData(std::shared_ptr<ITransactionContext> ctx,
fdb::native::FDBReadBlobGranuleContext& bgCtx,
fdb::GranuleDescription desc,
fdb::KeyRange keyRange,
int64_t readVersion) {
ASSERT(desc.keyRange.beginKey < desc.keyRange.endKey);
// beginVersion of zero means snapshot present
// validate snapshot file
ASSERT(desc.snapshotFile.has_value());
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("Loading snapshot file {0}\n", fdb::toCharsRef(desc.snapshotFile->filename)));
}
validateSnapshotData(ctx, bgCtx, *desc.snapshotFile, desc.keyRange);
// validate delta files
int64_t lastDFMaxVersion = 0;
for (int i = 0; i < desc.deltaFiles.size(); i++) {
validateDeltaData(ctx, bgCtx, desc.deltaFiles[i], desc.keyRange, lastDFMaxVersion);
}
// validate memory mutations
int64_t lastVersion = 0;
for (int i = 0; i < desc.memoryMutations.size(); i++) {
fdb::GranuleMutation& m = desc.memoryMutations[i];
ASSERT(m.type == 0 || m.type == 1);
ASSERT(m.version > 0);
ASSERT(m.version >= lastVersion);
ASSERT(m.version <= readVersion);
lastVersion = m.version;
ASSERT(m.type == 0 || m.type == 1);
ASSERT(desc.keyRange.beginKey <= m.param1);
ASSERT(m.param1 < desc.keyRange.endKey);
if (m.type == 1) {
ASSERT(desc.keyRange.beginKey <= m.param2);
ASSERT(m.param2 <= desc.keyRange.endKey);
}
// TODO have delta mutations update map
}
// TODO: validate map against data store
}
void validateBlobGranuleDescriptions(std::shared_ptr<ITransactionContext> ctx,
std::vector<fdb::GranuleDescription> results,
fdb::KeyRange keyRange,
std::optional<int> tenantId,
int64_t readVersion) {
ASSERT(!results.empty());
if (tenantId) {
// FIXME: support tenants!!
info("Skipping validation because of tenant.");
return;
}
ASSERT(results.front().keyRange.beginKey <= keyRange.beginKey);
ASSERT(keyRange.endKey <= results.back().keyRange.endKey);
for (int i = 0; i < results.size() - 1; i++) {
ASSERT(results[i].keyRange.endKey == results[i + 1].keyRange.beginKey);
}
TesterGranuleContext testerContext(ctx->getBGBasePath());
fdb::native::FDBReadBlobGranuleContext bgCtx = createGranuleContext(&testerContext);
for (int i = 0; i < results.size(); i++) {
validateBGDescriptionData(ctx, bgCtx, results[i], keyRange, readVersion);
}
}
void randomReadDescription(TTaskFct cont, std::optional<int> tenantId) {
fdb::KeyRange keyRange = randomNonEmptyKeyRange();
auto results = std::make_shared<std::vector<fdb::GranuleDescription>>();
auto readVersionOut = std::make_shared<int64_t>();
debugOp("ReadDesc", keyRange, tenantId, "starting");
execTransaction(
[this, keyRange, tenantId, results, readVersionOut](auto ctx) {
ctx->tx().setOption(FDB_TR_OPTION_READ_YOUR_WRITES_DISABLE);
int64_t* rvo = (int64_t*)readVersionOut.get();
fdb::Future f =
ctx->tx().readBlobGranulesDescription(keyRange.beginKey, keyRange.endKey, 0, -2, rvo).eraseType();
ctx->continueAfter(
f,
[this, ctx, keyRange, tenantId, results, readVersionOut, f]() {
*results = copyGranuleDescriptionArray(f.get<fdb::future_var::GranuleDescriptionRefArray>());
this->validateBlobGranuleDescriptions(ctx, *results, keyRange, tenantId, *readVersionOut);
ctx->done();
},
true);
},
[this, keyRange, tenantId, results, readVersionOut, cont]() {
debugOp("ReadDesc",
keyRange,
tenantId,
fmt::format("complete @ {0} with {1} granules", *readVersionOut, results->size()));
schedule(cont);
},
getTenant(tenantId));
}
void randomOperation(TTaskFct cont) override {
std::optional<int> tenantId = randomTenant();
@ -381,6 +519,9 @@ private:
case OP_VERIFY:
randomVerifyOp(cont, tenantId);
break;
case OP_READ_DESC:
randomReadDescription(cont, tenantId);
break;
}
}
};

View File

@ -112,6 +112,30 @@ GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefAr
return out;
};
GranuleDescriptionArray copyGranuleDescriptionArray(fdb::future_var::GranuleDescriptionRefArray::Type array) {
auto& [in_desc, in_count] = array;
GranuleDescriptionArray out;
for (int i = 0; i < in_count; ++i) {
fdb::native::FDBBGFileDescription nativeDesc = *in_desc++;
out.emplace_back(nativeDesc);
}
return out;
};
GranuleMutationArray copyGranuleMutationArray(fdb::future_var::GranuleMutationRefArray::Type array) {
auto& [in_mutations, in_count] = array;
GranuleMutationArray out;
for (int i = 0; i < in_count; ++i) {
fdb::native::FDBBGMutation nativeMutation = *in_mutations++;
out.emplace_back(nativeMutation);
}
return out;
};
TmpFile::~TmpFile() {
if (!filename.empty()) {
remove();

View File

@ -136,6 +136,12 @@ KeyRangeArray copyKeyRangeArray(fdb::future_var::KeyRangeRefArray::Type array);
using GranuleSummaryArray = std::vector<fdb::GranuleSummary>;
GranuleSummaryArray copyGranuleSummaryArray(fdb::future_var::GranuleSummaryRefArray::Type array);
using GranuleDescriptionArray = std::vector<fdb::GranuleDescription>;
GranuleDescriptionArray copyGranuleDescriptionArray(fdb::future_var::GranuleDescriptionRefArray::Type array);
using GranuleMutationArray = std::vector<fdb::GranuleMutation>;
GranuleMutationArray copyGranuleMutationArray(fdb::future_var::GranuleMutationRefArray::Type array);
static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Do not support non-little-endian systems");
// Converts a little-endian encoded number into an integral type.
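The comment above refers to a little-endian decoding helper; a typical standalone sketch of such a function (an assumption about its shape, not the repository's exact code):

#include <cstdint>
#include <cstring>
#include <type_traits>

// Decode sizeof(T) little-endian bytes into an integral value. On a little-endian host,
// which the static_assert above guarantees, this is just a memcpy.
template <typename T>
T fromLittleEndianBytes(const uint8_t* bytes) {
    static_assert(std::is_integral_v<T>, "integral types only");
    T value;
    std::memcpy(&value, bytes, sizeof(T));
    return value;
}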

View File

@ -22,5 +22,6 @@ maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -21,5 +21,6 @@ maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -14,5 +14,6 @@ maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -0,0 +1,24 @@
[[test]]
title = 'Blob Granule API Tenant Correctness Multi Threaded'
multiThreaded = true
buggify = true
minFdbThreads = 2
maxFdbThreads = 8
minClients = 1
maxClients = 8
minTenants = 1
maxTenants = 5
[[server]]
blob_granules_enabled = true
[[test.workload]]
name = 'ApiBlobGranuleCorrectness'
minKeyLength = 1
maxKeyLength = 64
minValueLength = 1
maxValueLength = 1000
maxKeysPerTransaction = 50
# TODO - increase initialSize and/or buggify down BG_SNAPSHOT_FILE_TARGET_BYTES to force multiple granules
initialSize = 100
numRandomOperations = 100

View File

@ -34,6 +34,8 @@
#include "SimpleOpt/SimpleOpt.h"
#include <thread>
#include <string_view>
#include <unordered_map>
#include "fdbclient/FDBOptions.g.h"
#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__))
#include <unistd.h>
@ -43,11 +45,6 @@
#error Unsupported platform
#endif
#undef ERROR
#define ERROR(name, number, description) enum { error_code_##name = number };
#include "flow/error_definitions.h"
#define API_VERSION_CLIENT_TMP_DIR 720
using namespace std::string_view_literals;
@ -59,17 +56,14 @@ enum TesterOptionId {
OPT_CONNFILE,
OPT_EXTERNAL_CLIENT_LIBRARY,
OPT_EXTERNAL_CLIENT_DIRECTORY,
OPT_DISABLE_LOCAL_CLIENT,
OPT_DISABLE_CLIENT_BYPASS,
OPT_API_VERSION,
OPT_TRANSACTION_TIMEOUT,
OPT_TRACE,
OPT_TRACE_DIR,
OPT_TMP_DIR,
OPT_IGNORE_EXTERNAL_CLIENT_FAILURES,
OPT_FAIL_INCOMPATIBLE_CLIENT,
OPT_EXPECTED_ERROR,
OPT_PRINT_STATUS
OPT_PRINT_STATUS,
OPT_NETWORK_OPTION
};
const int MIN_TESTABLE_API_VERSION = 400;
@ -81,17 +75,14 @@ CSimpleOpt::SOption TesterOptionDefs[] = //
{ OPT_CONNFILE, "--cluster-file", SO_REQ_SEP },
{ OPT_EXTERNAL_CLIENT_LIBRARY, "--external-client-library", SO_REQ_SEP },
{ OPT_EXTERNAL_CLIENT_DIRECTORY, "--external-client-dir", SO_REQ_SEP },
{ OPT_DISABLE_LOCAL_CLIENT, "--disable-local-client", SO_NONE },
{ OPT_DISABLE_CLIENT_BYPASS, "--disable-client-bypass", SO_NONE },
{ OPT_API_VERSION, "--api-version", SO_REQ_SEP },
{ OPT_TRANSACTION_TIMEOUT, "--transaction-timeout", SO_REQ_SEP },
{ OPT_TRACE, "--log", SO_NONE },
{ OPT_TRACE_DIR, "--log-dir", SO_REQ_SEP },
{ OPT_TMP_DIR, "--tmp-dir", SO_REQ_SEP },
{ OPT_IGNORE_EXTERNAL_CLIENT_FAILURES, "--ignore-external-client-failures", SO_NONE },
{ OPT_FAIL_INCOMPATIBLE_CLIENT, "--fail-incompatible-client", SO_NONE },
{ OPT_EXPECTED_ERROR, "--expected-error", SO_REQ_SEP },
{ OPT_PRINT_STATUS, "--print-status", SO_NONE },
{ OPT_NETWORK_OPTION, "--network-option-", SO_REQ_SEP },
SO_END_OF_OPTIONS };
class TesterOptions {
@ -111,6 +102,7 @@ public:
bool failIncompatibleClient = false;
fdb::Error::CodeType expectedError = 0;
bool printStatus = false;
std::vector<std::pair<std::string, std::string>> networkOptions;
};
namespace {
@ -130,10 +122,6 @@ void printProgramUsage(const char* execName) {
" Path to the external client library.\n"
" --external-client-dir DIR\n"
" Directory containing external client libraries.\n"
" --disable-local-client\n"
" Disable the local client, i.e. use only external client libraries.\n"
" --disable-client-bypass\n"
" Disable bypassing Multi-Version Client when using the local client.\n"
" --api-version VERSION\n"
" Required FDB API version (default %d).\n"
" --transaction-timeout MILLISECONDS\n"
@ -144,14 +132,12 @@ void printProgramUsage(const char* execName) {
" no effect unless --log is specified.\n"
" --tmp-dir DIR\n"
" Directory for temporary files of the client.\n"
" --ignore-external-client-failures\n"
" Ignore failures to initialize external clients.\n"
" --fail-incompatible-client\n"
" Fail if there is no client matching the server version.\n"
" --expected-error ERR\n"
" FDB error code the test expected to fail with (default: 0).\n"
" --print-status\n"
" Print database client status.\n"
" --network-option-OPTIONNAME OPTIONVALUE\n"
" Changes a network option. OPTIONAME should be lowercase.\n"
" -h, --help Display this help and exit.\n",
FDB_API_VERSION);
}
@ -170,6 +156,19 @@ bool processIntOption(const std::string& optionName, const std::string& value, i
return true;
}
// Extracts the key for command line arguments that are specified with a prefix (e.g. --knob-).
// This function converts any hyphens in the extracted key to underscores.
bool extractPrefixedArgument(std::string prefix, const std::string& arg, std::string& res) {
if (arg.size() <= prefix.size() || arg.find(prefix) != 0 ||
(arg[prefix.size()] != '-' && arg[prefix.size()] != '_')) {
return false;
}
res = arg.substr(prefix.size() + 1);
std::transform(res.begin(), res.end(), res.begin(), [](int c) { return c == '-' ? '_' : c; });
return true;
}
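For instance (a usage sketch of the helper above, not a test from this change):

std::string key;
// With prefix "--network-option" and argument "--network-option-trace_format",
// key becomes "trace_format"; hyphens after the prefix would be rewritten as underscores.
bool ok = extractPrefixedArgument("--network-option", "--network-option-trace_format", key);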
bool processArg(const CSimpleOpt& args) {
switch (args.OptionId()) {
case OPT_CONNFILE:
@ -181,12 +180,6 @@ bool processArg(const CSimpleOpt& args) {
case OPT_EXTERNAL_CLIENT_DIRECTORY:
options.externalClientDir = args.OptionArg();
break;
case OPT_DISABLE_LOCAL_CLIENT:
options.disableLocalClient = true;
break;
case OPT_DISABLE_CLIENT_BYPASS:
options.disableClientBypass = true;
break;
case OPT_API_VERSION:
if (!processIntOption(
args.OptionText(), args.OptionArg(), MIN_TESTABLE_API_VERSION, FDB_API_VERSION, options.apiVersion)) {
@ -207,12 +200,6 @@ bool processArg(const CSimpleOpt& args) {
case OPT_TMP_DIR:
options.tmpDir = args.OptionArg();
break;
case OPT_IGNORE_EXTERNAL_CLIENT_FAILURES:
options.ignoreExternalClientFailures = true;
break;
case OPT_FAIL_INCOMPATIBLE_CLIENT:
options.failIncompatibleClient = true;
break;
case OPT_EXPECTED_ERROR:
if (!processIntOption(args.OptionText(), args.OptionArg(), 0, 10000, options.expectedError)) {
return false;
@ -221,6 +208,16 @@ bool processArg(const CSimpleOpt& args) {
case OPT_PRINT_STATUS:
options.printStatus = true;
break;
case OPT_NETWORK_OPTION: {
std::string optionName;
if (!extractPrefixedArgument("--network-option", args.OptionSyntax(), optionName)) {
fmt::print(stderr, "ERROR: unable to parse network option '{}'\n", args.OptionSyntax());
return false;
}
options.networkOptions.emplace_back(optionName, args.OptionArg());
break;
}
}
return true;
}
@ -272,6 +269,12 @@ void fdb_check(fdb::Error e, std::string_view msg) {
}
}
std::string stringToUpper(const std::string& str) {
std::string outStr(str);
std::transform(outStr.begin(), outStr.end(), outStr.begin(), [](char c) { return std::toupper(c); });
return outStr;
}
void applyNetworkOptions() {
if (!options.tmpDir.empty() && options.apiVersion >= API_VERSION_CLIENT_TMP_DIR) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_CLIENT_TMP_DIR, options.tmpDir);
@ -283,20 +286,21 @@ void applyNetworkOptions() {
if (!options.externalClientDir.empty()) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_EXTERNAL_CLIENT_DIRECTORY, options.externalClientDir);
}
if (options.disableLocalClient) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_LOCAL_CLIENT);
}
if (options.trace) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_TRACE_ENABLE, options.traceDir);
}
if (options.ignoreExternalClientFailures) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_IGNORE_EXTERNAL_CLIENT_FAILURES);
std::unordered_map<std::string, FDBNetworkOption> networkOptionsByName;
for (auto const& [optionCode, optionInfo] : FDBNetworkOptions::optionInfo) {
networkOptionsByName[optionInfo.name] = static_cast<FDBNetworkOption>(optionCode);
}
if (options.failIncompatibleClient) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_FAIL_INCOMPATIBLE_CLIENT);
}
if (options.disableClientBypass) {
fdb::network::setOption(FDBNetworkOption::FDB_NET_OPTION_DISABLE_CLIENT_BYPASS);
for (auto const& [optionName, optionVal] : options.networkOptions) {
auto iter = networkOptionsByName.find(stringToUpper(optionName));
if (iter == networkOptionsByName.end()) {
fmt::print(stderr, "Unknown network option {}\n", optionName);
}
fdb::network::setOption(iter->second, optionVal);
}
}

View File

@ -81,6 +81,61 @@ struct GranuleSummary {
}
};
struct GranuleFilePointer {
ByteString filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
GranuleFilePointer(const native::FDBBGFilePointer& nativePointer) {
filename = fdb::Key(nativePointer.filename_ptr, nativePointer.filename_length);
offset = nativePointer.file_offset;
length = nativePointer.file_length;
fullFileLength = nativePointer.full_file_length;
}
};
struct GranuleMutation {
native::FDBBGMutationType type;
int64_t version;
ByteString param1;
ByteString param2;
GranuleMutation(const native::FDBBGMutation& nativeMutation) {
type = static_cast<native::FDBBGMutationType>(nativeMutation.type);
version = nativeMutation.version;
param1 = ByteString(nativeMutation.param1_ptr, nativeMutation.param1_length);
param2 = ByteString(nativeMutation.param2_ptr, nativeMutation.param2_length);
}
};
struct GranuleDescription {
KeyRange keyRange;
std::optional<GranuleFilePointer> snapshotFile;
std::vector<GranuleFilePointer> deltaFiles;
std::vector<GranuleMutation> memoryMutations;
GranuleDescription(const native::FDBBGFileDescription& nativeDesc) {
keyRange.beginKey = fdb::Key(nativeDesc.key_range.begin_key, nativeDesc.key_range.begin_key_length);
keyRange.endKey = fdb::Key(nativeDesc.key_range.end_key, nativeDesc.key_range.end_key_length);
if (nativeDesc.snapshot_present) {
snapshotFile = GranuleFilePointer(nativeDesc.snapshot_file_pointer);
}
if (nativeDesc.delta_file_count > 0) {
deltaFiles.reserve(nativeDesc.delta_file_count);
for (int i = 0; i < nativeDesc.delta_file_count; i++) {
deltaFiles.emplace_back(nativeDesc.delta_files[i]);
}
}
if (nativeDesc.memory_mutation_count > 0) {
memoryMutations.reserve(nativeDesc.memory_mutation_count);
for (int i = 0; i < nativeDesc.memory_mutation_count; i++) {
memoryMutations.emplace_back(nativeDesc.memory_mutations[i]);
}
}
}
};
inline uint8_t const* toBytePtr(char const* ptr) noexcept {
return reinterpret_cast<uint8_t const*>(ptr);
}
@ -246,6 +301,42 @@ struct GranuleSummaryRefArray {
}
};
// fdb_future_readbg_get_descriptions
struct GranuleDescriptionRef : native::FDBBGFileDescription {
fdb::KeyRef beginKey() const noexcept {
return fdb::KeyRef(native::FDBBGFileDescription::key_range.begin_key,
native::FDBBGFileDescription::key_range.begin_key_length);
}
fdb::KeyRef endKey() const noexcept {
return fdb::KeyRef(native::FDBBGFileDescription::key_range.end_key,
native::FDBBGFileDescription::key_range.end_key_length);
}
};
struct GranuleDescriptionRefArray {
using Type = std::tuple<GranuleDescriptionRef*, int>;
static Error extract(native::FDBFuture* f, Type& out) noexcept {
auto& [out_desc, out_count] = out;
auto err = native::fdb_future_readbg_get_descriptions(
f, reinterpret_cast<native::FDBBGFileDescription**>(&out_desc), &out_count);
return Error(err);
}
};
struct GranuleMutationRef : native::FDBBGMutation {
fdb::KeyRef param1() const noexcept {
return fdb::BytesRef(native::FDBBGMutation::param1_ptr, native::FDBBGMutation::param1_length);
}
fdb::KeyRef param2() const noexcept {
return fdb::BytesRef(native::FDBBGMutation::param2_ptr, native::FDBBGMutation::param2_length);
}
};
struct GranuleMutationRefArray {
using Type = std::tuple<GranuleMutationRef const*, int>;
};
} // namespace future_var
[[noreturn]] inline void throwError(std::string_view preamble, Error err) {
@ -335,6 +426,7 @@ class Result {
public:
using KeyValueRefArray = future_var::KeyValueRefArray::Type;
using GranuleMutationRefArray = future_var::GranuleMutationRefArray::Type;
Error getKeyValueArrayNothrow(KeyValueRefArray& out) const noexcept {
auto out_more_native = native::fdb_bool_t{};
@ -351,6 +443,20 @@ public:
throwError("ERROR: result_get_keyvalue_array(): ", err);
return ret;
}
Error getGranuleMutationArrayNothrow(GranuleMutationRefArray& out) const noexcept {
auto& [out_mutations, out_count] = out;
auto err_raw = native::fdb_result_get_bg_mutations_array(
r.get(), reinterpret_cast<const native::FDBBGMutation**>(&out_mutations), &out_count);
return Error(err_raw);
}
GranuleMutationRefArray getGranuleMutationArray() const {
auto ret = GranuleMutationRefArray{};
if (auto err = getGranuleMutationArrayNothrow(ret))
throwError("ERROR: result_get_keyvalue_array(): ", err);
return ret;
}
};
class Future {
@ -640,6 +746,29 @@ public:
return native::fdb_transaction_watch(tr.get(), key.data(), intSize(key));
}
TypedFuture<future_var::GranuleDescriptionRefArray> readBlobGranulesDescription(KeyRef begin,
KeyRef end,
int64_t beginVersion,
int64_t readVersion,
int64_t* readVersionOut) {
return native::fdb_transaction_read_blob_granules_description(tr.get(),
begin.data(),
intSize(begin),
end.data(),
intSize(end),
beginVersion,
readVersion,
readVersionOut);
}
Result parseSnapshotFile(BytesRef fileData) {
return Result(native::fdb_readbg_parse_snapshot_file(fileData.data(), intSize(fileData)));
}
Result parseDeltaFile(BytesRef fileData) {
return Result(native::fdb_readbg_parse_delta_file(fileData.data(), intSize(fileData)));
}
TypedFuture<future_var::None> commit() { return native::fdb_transaction_commit(tr.get()); }
TypedFuture<future_var::None> onError(Error err) { return native::fdb_transaction_on_error(tr.get(), err.code()); }

View File

@ -8,6 +8,7 @@ import os
import glob
import unittest
import json
import re
from threading import Thread
import time
@ -99,6 +100,9 @@ class ClientConfigTest:
self.expected_error = None
self.transaction_timeout = None
self.print_status = False
self.trace_file_identifier = None
self.trace_initialize_on_setup = False
self.trace_format = None
# ----------------------------
# Configuration methods
@ -208,6 +212,9 @@ class ClientConfigTest:
self.tc.assertTrue("Healthy" in self.status_json)
self.tc.assertEqual(expected_is_healthy, self.status_json["Healthy"])
def list_trace_files(self):
return glob.glob(os.path.join(self.log_dir, "*"))
# ----------------------------
# Executing the test
# ----------------------------
@ -222,10 +229,10 @@ class ClientConfigTest:
cmd_args += ["--log", "--log-dir", self.log_dir]
if self.disable_local_client:
cmd_args += ["--disable-local-client"]
cmd_args += ["--network-option-disable_local_client", ""]
if self.disable_client_bypass:
cmd_args += ["--disable-client-bypass"]
cmd_args += ["--network-option-disable_client_bypass", ""]
if self.external_lib_path is not None:
cmd_args += ["--external-client-library", self.external_lib_path]
@ -234,10 +241,19 @@ class ClientConfigTest:
cmd_args += ["--external-client-dir", self.external_lib_dir]
if self.ignore_external_client_failures:
cmd_args += ["--ignore-external-client-failures"]
cmd_args += ["--network-option-ignore_external_client_failures", ""]
if self.fail_incompatible_client:
cmd_args += ["--fail-incompatible-client"]
cmd_args += ["--network-option-fail_incompatible_client", ""]
if self.trace_file_identifier is not None:
cmd_args += ["--network-option-trace_file_identifier", self.trace_file_identifier]
if self.trace_initialize_on_setup:
cmd_args += ["--network-option-trace_initialize_on_setup", ""]
if self.trace_format is not None:
cmd_args += ["--network-option-trace_format", self.trace_format]
if self.api_version is not None:
cmd_args += ["--api-version", str(self.api_version)]
@ -252,26 +268,20 @@ class ClientConfigTest:
cmd_args += ["--print-status"]
print("\nExecuting test command: {}".format(" ".join([str(c) for c in cmd_args])), file=sys.stderr)
try:
tester_proc = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=sys.stderr)
out, _ = tester_proc.communicate()
self.tc.assertEqual(0, tester_proc.returncode)
if self.print_status:
# Parse the output as status json
try:
self.status_json = json.loads(out)
except json.JSONDecodeError as e:
print("Error '{}' parsing output {}".format(e, out.decode()), file=sys.stderr)
self.tc.assertIsNotNone(self.status_json)
print("Status: ", self.status_json, file=sys.stderr)
else:
# Otherwise redirect the output to the console
print(out.decode(), file=sys.stderr)
finally:
self.cleanup()
def cleanup(self):
shutil.rmtree(self.test_dir)
tester_proc = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=sys.stderr)
out, _ = tester_proc.communicate()
self.tc.assertEqual(0, tester_proc.returncode)
if self.print_status:
# Parse the output as status json
try:
self.status_json = json.loads(out)
except json.JSONDecodeError as e:
print("Error '{}' parsing output {}".format(e, out.decode()), file=sys.stderr)
self.tc.assertIsNotNone(self.status_json)
print("Status: ", self.status_json, file=sys.stderr)
else:
# Otherwise redirect the output to the console
print(out.decode(), file=sys.stderr)
class ClientConfigTests(unittest.TestCase):
@ -516,6 +526,171 @@ class ClientConfigSeparateCluster(unittest.TestCase):
self.cluster.tear_down()
# Test client-side tracing
class ClientTracingTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.cluster = TestCluster(CURRENT_VERSION)
cls.cluster.setup()
@classmethod
def tearDownClass(cls):
cls.cluster.tear_down()
def test_default_config_normal_case(self):
# Test trace files created with a default trace configuration
# in a normal case
test = self.test
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION])
test.api_version = api_version_from_str(PREV_RELEASE_VERSION)
test.disable_local_client = True
self.exec_test()
self.assertEqual(3, len(self.trace_files))
primary_trace = self.find_trace_file(with_ip=True)
self.find_and_check_event(primary_trace, "ClientStart", ["Machine"], [])
cur_ver_trace = self.find_trace_file(with_ip=True, version=CURRENT_VERSION, thread_idx=0)
self.find_and_check_event(cur_ver_trace, "ClientStart", ["Machine"], [])
prev_ver_trace = self.find_trace_file(with_ip=True, version=PREV_RELEASE_VERSION, thread_idx=0)
self.find_and_check_event(prev_ver_trace, "ClientStart", ["Machine"], [])
def test_default_config_error_case(self):
# Test that no trace files are created with a default configuration
# when a client fails to initialize
test = self.test
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.expected_error = 2204 # API function missing
self.exec_test()
self.assertEqual(0, len(self.trace_files))
def test_init_on_setup_normal_case(self):
# Test trace files created with trace_initialize_on_setup option
# in a normal case
test = self.test
test.create_external_lib_dir([CURRENT_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_initialize_on_setup = True
self.exec_test()
self.assertEqual(2, len(self.trace_files))
primary_trace = self.find_trace_file()
# The machine address will be available only in the second ClientStart event
self.find_and_check_event(primary_trace, "ClientStart", [], ["Machine"])
self.find_and_check_event(primary_trace, "ClientStart", ["Machine"], [], seqno=1)
cur_ver_trace = self.find_trace_file(version=CURRENT_VERSION, thread_idx=0)
self.find_and_check_event(cur_ver_trace, "ClientStart", [], ["Machine"])
self.find_and_check_event(cur_ver_trace, "ClientStart", ["Machine"], [], seqno=1)
def test_init_on_setup_trace_error_case(self):
# Test trace files created with trace_initialize_on_setup option
# when a client fails to initialize
test = self.test
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_initialize_on_setup = True
test.expected_error = 2204 # API function missing
self.exec_test()
self.assertEqual(1, len(self.trace_files))
primary_trace = self.find_trace_file()
self.find_and_check_event(primary_trace, "ClientStart", [], ["Machine"])
def test_trace_identifier(self):
# Test trace files created with file identifier
test = self.test
test.create_external_lib_dir([CURRENT_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_file_identifier = "fdbclient"
self.exec_test()
self.assertEqual(2, len(self.trace_files))
self.find_trace_file(with_ip=True, identifier="fdbclient")
self.find_trace_file(with_ip=True, identifier="fdbclient", version=CURRENT_VERSION, thread_idx=0)
def test_init_on_setup_and_trace_identifier(self):
# Test trace files created with trace_initialize_on_setup option
# and file identifier
test = self.test
test.create_external_lib_dir([CURRENT_VERSION])
test.api_version = api_version_from_str(CURRENT_VERSION)
test.disable_local_client = True
test.trace_initialize_on_setup = True
test.trace_file_identifier = "fdbclient"
self.exec_test()
self.assertEqual(2, len(self.trace_files))
self.find_trace_file(identifier="fdbclient")
self.find_trace_file(identifier="fdbclient", version=CURRENT_VERSION, thread_idx=0)
# ---------------
# Helper methods
# ---------------
def setUp(self):
self.test = ClientConfigTest(self)
self.trace_files = None
self.test.trace_format = "json"
def exec_test(self):
self.test.exec()
self.trace_files = self.test.list_trace_files()
if self.test.trace_format == "json":
self.load_trace_file_events()
def load_trace_file_events(self):
self.trace_file_events = {}
for trace in self.trace_files:
events = []
with open(trace, "r") as f:
for line in f:
events.append(json.loads(line))
self.trace_file_events[trace] = events
def find_trace_file(self, with_ip=False, identifier=None, version=None, thread_idx=None):
self.assertIsNotNone(self.trace_files)
for trace_file in self.trace_files:
name = os.path.basename(trace_file)
# trace prefix must be in all files
self.assertTrue(name.startswith("trace."))
pattern = "^trace\."
if with_ip:
pattern += "127\.0\.0\.1\."
else:
pattern += "0\.0\.0\.0\."
if identifier is not None:
pattern += identifier
else:
pattern += "\d+"
if version is not None:
pattern += "_v{}".format(version.replace(".", "_"))
if thread_idx is not None:
pattern += "t{}".format(thread_idx)
pattern += "\.\d+\.\w+\.\d+\.\d+\.{}$".format(self.test.trace_format)
if re.match(pattern, name):
return trace_file
self.fail("No maching trace file found")
def find_and_check_event(self, trace_file, event_type, attr_present, attr_missing, seqno=0):
self.assertTrue(trace_file in self.trace_file_events)
for event in self.trace_file_events[trace_file]:
if event["Type"] == event_type:
if seqno > 0:
seqno -= 1
continue
for attr in attr_present:
self.assertTrue(attr in event)
for attr in attr_missing:
self.assertFalse(attr in event)
return
self.fail("No matching event found")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python3
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from pathlib import Path
import platform
import shutil
import subprocess
import sys
@ -53,7 +52,7 @@ class TestEnv(LocalCluster):
self.downloader.binary_path(version, "fdbcli"),
1,
)
self.set_env_var("LD_LIBRARY_PATH", self.downloader.lib_dir(version))
self.set_env_var("LD_LIBRARY_PATH", "%s:%s" % (self.downloader.lib_dir(version), os.getenv("LD_LIBRARY_PATH")))
client_lib = self.downloader.lib_path(version)
assert client_lib.exists(), "{} does not exist".format(client_lib)
self.client_lib_external = self.tmp_dir.joinpath("libfdb_c_external.so")
@ -91,9 +90,8 @@ class FdbCShimTests:
self.api_test_dir = Path(args.api_test_dir).resolve()
assert self.api_test_dir.exists(), "{} does not exist".format(self.api_test_dir)
self.downloader = FdbBinaryDownloader(args.build_dir)
# binary downloads are currently available only for x86_64
self.platform = platform.machine()
if self.platform == "x86_64":
self.test_prev_versions = not args.disable_prev_version_tests
if self.test_prev_versions:
self.downloader.download_old_binaries(PREV_RELEASE_VERSION)
self.downloader.download_old_binaries("7.0.0")
@ -182,7 +180,8 @@ class FdbCShimTests:
if use_external_lib:
cmd_args = cmd_args + ["--disable-local-client", "--external-client-library", test_env.client_lib_external]
env_vars = os.environ.copy()
env_vars["LD_LIBRARY_PATH"] = self.downloader.lib_dir(version) if set_ld_lib_path else ""
if set_ld_lib_path:
env_vars["LD_LIBRARY_PATH"] = "%s:%s" % (self.downloader.lib_dir(version), os.getenv("LD_LIBRARY_PATH"))
if set_env_path:
env_vars["FDB_LOCAL_CLIENT_LIBRARY_PATH"] = (
"dummy" if invalid_lib_path else self.downloader.lib_path(version)
@ -230,8 +229,7 @@ class FdbCShimTests:
# Test calling a function that exists in the loaded library, but not for the selected API version
self.run_c_shim_lib_tester(CURRENT_VERSION, test_env, call_set_path=True, api_version=700)
# binary downloads are currently available only for x86_64
if self.platform == "x86_64":
if self.test_prev_versions:
# Test the API workload with the release version
self.run_c_api_test(PREV_RELEASE_VERSION, DEFAULT_TEST_FILE)
@ -283,6 +281,12 @@ if __name__ == "__main__":
parser.add_argument(
"--api-test-dir", type=str, help="Path to a directory with api test definitions.", required=True
)
parser.add_argument(
"--disable-prev-version-tests",
action="store_true",
default=False,
help="Disable tests that need binaries of previous versions",
)
args = parser.parse_args()
test = FdbCShimTests(args)
test.run_tests()

View File

@ -87,7 +87,7 @@ void fdb_flow_test() {
g_network = newNet2(TLSConfig());
openTraceFile(NetworkAddress(), 1000000, 1000000, ".");
openTraceFile({}, 1000000, 1000000, ".");
systemMonitor();
uncancellable(recurring(&systemMonitor, 5.0, TaskPriority::FlushTrace));

View File

@ -107,6 +107,11 @@ func (o NetworkOptions) SetTraceShareAmongClientThreads() error {
return o.setOpt(37, nil)
}
// Initialize trace files on network setup, determine the local IP later. Otherwise tracing is initialized when opening the first database.
func (o NetworkOptions) SetTraceInitializeOnSetup() error {
return o.setOpt(38, nil)
}
// Set file suffix for partially written log files.
//
// Parameter: Append this suffix to partially written log files. When a log file is complete, it is renamed to remove the suffix. No separator is added between the file and the suffix. If you want to add a file extension, you should include the separator - e.g. '.tmp' instead of 'tmp' to add the 'tmp' extension.
@ -422,9 +427,11 @@ func (o DatabaseOptions) SetUseConfigDatabase() error {
return o.setOpt(800, nil)
}
// An integer between 0 and 100 (default is 0) expressing the probability that a client will verify it can't read stale data whenever it detects a recovery.
func (o DatabaseOptions) SetTestCausalReadRisky() error {
return o.setOpt(900, nil)
// Enables verification of causal read risky by checking whether clients are able to read stale data when they detect a recovery, and logging an error if so.
//
// Parameter: integer between 0 and 100 expressing the probability a client will verify it can't read stale data
func (o DatabaseOptions) SetTestCausalReadRisky(param int64) error {
return o.setOpt(900, int64ToBytes(param))
}
// The transaction, if not self-conflicting, may be committed a second time after commit succeeds, in the event of a fault

View File

@ -505,15 +505,27 @@ public class AsyncStackTester {
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_SET_ACTIVE) {
return inst.popParam().thenAcceptAsync(param -> {
return inst.popParam().thenComposeAsync(param -> {
byte[] tenantName = (byte[])param;
inst.context.setTenant(Optional.of(tenantName));
return inst.context.setTenant(Optional.of(tenantName)).thenAcceptAsync(id -> {
inst.push("SET_ACTIVE_TENANT".getBytes());
}, FDB.DEFAULT_EXECUTOR);
}, FDB.DEFAULT_EXECUTOR);
}
else if (op == StackOperation.TENANT_CLEAR_ACTIVE) {
inst.context.setTenant(Optional.empty());
return AsyncUtil.DONE;
}
else if (op == StackOperation.TENANT_GET_ID) {
if (inst.context.tenant.isPresent()) {
return inst.context.tenant.get().getId().thenAcceptAsync(id -> {
inst.push("GOT_TENANT_ID".getBytes());
}, FDB.DEFAULT_EXECUTOR);
} else {
inst.push("NO_ACTIVE_TENANT".getBytes());
return AsyncUtil.DONE;
}
}
else if (op == StackOperation.UNIT_TESTS) {
inst.context.db.options().setLocationCacheSize(100001);
return inst.context.db.runAsync(tr -> {

View File

@ -101,12 +101,14 @@ abstract class Context implements Runnable, AutoCloseable {
}
}
public synchronized void setTenant(Optional<byte[]> tenantName) {
public synchronized CompletableFuture<Long> setTenant(Optional<byte[]> tenantName) {
if (tenantName.isPresent()) {
tenant = Optional.of(tenantMap.computeIfAbsent(tenantName.get(), tn -> db.openTenant(tenantName.get())));
return tenant.get().getId();
}
else {
tenant = Optional.empty();
return CompletableFuture.completedFuture(-1L);
}
}

View File

@ -79,6 +79,7 @@ enum StackOperation {
TENANT_LIST,
TENANT_SET_ACTIVE,
TENANT_CLEAR_ACTIVE,
TENANT_GET_ID,
LOG_STACK
}

View File

@ -450,11 +450,20 @@ public class StackTester {
}
else if (op == StackOperation.TENANT_SET_ACTIVE) {
byte[] tenantName = (byte[])inst.popParam().join();
inst.context.setTenant(Optional.of(tenantName));
inst.context.setTenant(Optional.of(tenantName)).join();
inst.push("SET_ACTIVE_TENANT".getBytes());
}
else if (op == StackOperation.TENANT_CLEAR_ACTIVE) {
inst.context.setTenant(Optional.empty());
}
else if (op == StackOperation.TENANT_GET_ID) {
if (inst.context.tenant.isPresent()) {
inst.context.tenant.get().getId().join();
inst.push("GOT_TENANT_ID".getBytes());
} else {
inst.push("NO_ACTIVE_TENANT".getBytes());
}
}
else if (op == StackOperation.UNIT_TESTS) {
try {
inst.context.db.options().setLocationCacheSize(100001);

View File

@ -1713,6 +1713,9 @@ def init_c_api():
_capi.fdb_tenant_destroy.argtypes = [ctypes.c_void_p]
_capi.fdb_tenant_destroy.restype = None
_capi.fdb_tenant_get_id.argtypes = [ctypes.c_void_p]
_capi.fdb_tenant_get_id.restype = ctypes.c_void_p
_capi.fdb_tenant_create_transaction.argtypes = [
ctypes.c_void_p,
ctypes.POINTER(ctypes.c_void_p),

View File

@ -603,6 +603,8 @@ class Tester:
elif inst.op == six.u("TENANT_SET_ACTIVE"):
name = inst.pop()
self.tenant = self.db.open_tenant(name)
self.tenant.get_id().wait()
inst.push(b"SET_ACTIVE_TENANT")
elif inst.op == six.u("TENANT_CLEAR_ACTIVE"):
self.tenant = None
elif inst.op == six.u("TENANT_LIST"):
@ -618,6 +620,12 @@ class Tester:
except (json.decoder.JSONDecodeError, KeyError):
assert False, "Invalid Tenant Metadata"
inst.push(fdb.tuple.pack(tuple(result)))
elif inst.op == six.u("TENANT_GET_ID"):
if self.tenant is not None:
self.tenant.get_id().wait()
inst.push(b"GOT_TENANT_ID")
else:
inst.push(b"NO_ACTIVE_TENANT")
elif inst.op == six.u("UNIT_TESTS"):
try:
test_db_options(db)

View File

@ -380,8 +380,127 @@ function(create_valgrind_correctness_package)
endif()
endfunction()
function(prepare_binding_test_files build_directory target_name target_dependency)
add_custom_target(${target_name} DEPENDS ${target_dependency})
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:fdb_flow_tester> ${build_directory}/tests/flow/bin/fdb_flow_tester
COMMENT "Copy Flow tester for bindingtester")
set(generated_binding_files python/fdb/fdboptions.py)
if(WITH_JAVA_BINDING)
if(NOT FDB_RELEASE)
set(not_fdb_release_string "-SNAPSHOT")
else()
set(not_fdb_release_string "")
endif()
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/packages/fdb-java-${FDB_VERSION}${not_fdb_release_string}.jar
${build_directory}/tests/java/foundationdb-client.jar
COMMENT "Copy Java bindings for bindingtester")
add_dependencies(${target_name} fat-jar)
add_dependencies(${target_name} foundationdb-tests)
set(generated_binding_files ${generated_binding_files} java/foundationdb-tests.jar)
endif()
if(WITH_GO_BINDING)
add_dependencies(${target_name} fdb_go_tester fdb_go)
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/go/bin/_stacktester ${build_directory}/tests/go/build/bin/_stacktester
COMMAND ${CMAKE_COMMAND} -E make_directory ${build_directory}/tests/go/src/fdb/
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bindings/go/src/github.com/apple/foundationdb/bindings/go/src/fdb/generated.go # SRC
${build_directory}/tests/go/src/fdb/ # DEST
COMMENT "Copy generated.go for bindingtester")
endif()
foreach(generated IN LISTS generated_binding_files)
add_custom_command(
TARGET ${target_name}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/${generated} ${build_directory}/tests/${generated}
COMMENT "Copy ${generated} to bindingtester")
endforeach()
endfunction(prepare_binding_test_files)
function(package_bindingtester2)
if (WIN32 OR OPEN_FOR_IDE)
message(WARNING "Binding tester is not built (WIN32/OPEN_FOR_IDE)")
return()
endif()
set(fdbcName "libfdb_c.so")
if (APPLE)
set(fdbcName "libfdb_c.dylib")
endif ()
set(touch_file ${CMAKE_BINARY_DIR}/bindingtester2.touch)
set(build_directory ${CMAKE_BINARY_DIR}/bindingtester2)
set(tests_directory ${build_directory}/tests)
add_custom_command(
OUTPUT ${touch_file}
COMMAND ${CMAKE_COMMAND} -E remove_directory ${build_directory}
COMMAND ${CMAKE_COMMAND} -E make_directory ${build_directory}
COMMAND ${CMAKE_COMMAND} -E remove_directory ${tests_directory}
COMMAND ${CMAKE_COMMAND} -E make_directory ${tests_directory}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/bindings ${tests_directory}
COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_BINARY_DIR}/bindingtester2.touch"
COMMENT "Setup scratch directory for bindingtester2")
set(joshua_directory ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts)
set(output_files
${build_directory}/joshua_test
${build_directory}/joshua_timeout
${build_directory}/fdbcli
${build_directory}/fdbserver
${build_directory}/${fdbcName}
)
add_custom_command(
OUTPUT ${output_files}
DEPENDS strip_only_fdbcli
strip_only_fdbserver
strip_only_fdb_c
${joshua_directory}/binding_test_start.sh
${joshua_directory}/binding_test_timeout.sh
${touch_file}
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/packages/bin/fdbcli
${CMAKE_BINARY_DIR}/packages/bin/fdbserver
${CMAKE_BINARY_DIR}/packages/lib/${fdbcName}
${build_directory}
COMMAND ${CMAKE_COMMAND} -E copy ${joshua_directory}/binding_test_start.sh ${build_directory}/joshua_test
COMMAND ${CMAKE_COMMAND} -E copy ${joshua_directory}/binding_test_timeout.sh ${build_directory}/joshua_timeout
COMMENT "Copy executables and scripts to bindingtester2 dir")
set(local_cluster_files ${build_directory}/local_cluster)
set(local_cluster_directory ${CMAKE_SOURCE_DIR}/contrib/local_cluster)
add_custom_command(
OUTPUT ${local_cluster_files}
COMMAND ${CMAKE_COMMAND} -E copy_directory
${local_cluster_directory}
${build_directory}
)
prepare_binding_test_files(${build_directory} copy_bindingtester2_test_files ${touch_file})
set(tar_file ${CMAKE_BINARY_DIR}/packages/bindingtester2-${FDB_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
DEPENDS ${touch_file} ${output_files} ${local_cluster_files} copy_bindingtester2_test_files
COMMAND ${CMAKE_COMMAND} -E tar czf ${tar_file} *
WORKING_DIRECTORY ${build_directory}
COMMENT "Pack bindingtester2"
)
add_custom_target(bindingtester2 ALL DEPENDS ${tar_file})
endfunction(package_bindingtester2)
function(package_bindingtester)
if(WIN32 OR OPEN_FOR_IDE)
message(WARNING "Binding tester is not built (WIN32/OPEN_FOR_IDE)")
return()
elseif(APPLE)
set(fdbcName "libfdb_c.dylib")
@ -403,7 +522,6 @@ function(package_bindingtester)
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/localClusterStart.sh ${bdir}/localClusterStart.sh
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/contrib/Joshua/scripts/bindingTestScript.sh ${bdir}/bindingTestScript.sh
COMMENT "Copy executables and scripts to bindingtester dir")
file(GLOB_RECURSE test_files ${CMAKE_SOURCE_DIR}/bindings/*)
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/bindingtester.touch"
COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/bindingtester/tests
@ -412,60 +530,19 @@ function(package_bindingtester)
COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_BINARY_DIR}/bindingtester.touch"
COMMENT "Copy test files for bindingtester")
add_custom_target(copy_binding_output_files DEPENDS ${CMAKE_BINARY_DIR}/bindingtester.touch python_binding fdb_flow_tester)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:fdb_flow_tester> ${bdir}/tests/flow/bin/fdb_flow_tester
COMMENT "Copy Flow tester for bindingtester")
set(generated_binding_files python/fdb/fdboptions.py)
if(WITH_JAVA_BINDING)
if(NOT FDB_RELEASE)
set(not_fdb_release_string "-SNAPSHOT")
else()
set(not_fdb_release_string "")
endif()
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/packages/fdb-java-${FDB_VERSION}${not_fdb_release_string}.jar
${bdir}/tests/java/foundationdb-client.jar
COMMENT "Copy Java bindings for bindingtester")
add_dependencies(copy_binding_output_files fat-jar)
add_dependencies(copy_binding_output_files foundationdb-tests)
set(generated_binding_files ${generated_binding_files} java/foundationdb-tests.jar)
endif()
if(WITH_GO_BINDING AND NOT OPEN_FOR_IDE)
add_dependencies(copy_binding_output_files fdb_go_tester fdb_go)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/go/bin/_stacktester ${bdir}/tests/go/build/bin/_stacktester
COMMAND ${CMAKE_COMMAND} -E make_directory ${bdir}/tests/go/src/fdb/
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bindings/go/src/github.com/apple/foundationdb/bindings/go/src/fdb/generated.go # SRC
${bdir}/tests/go/src/fdb/ # DEST
COMMENT "Copy generated.go for bindingtester")
endif()
foreach(generated IN LISTS generated_binding_files)
add_custom_command(
TARGET copy_binding_output_files
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bindings/${generated} ${bdir}/tests/${generated}
COMMENT "Copy ${generated} to bindingtester")
endforeach()
prepare_binding_test_files(${bdir} copy_binding_output_files ${CMAKE_BINARY_DIR}/bindingtester.touch)
add_custom_target(copy_bindingtester_binaries
DEPENDS ${outfiles} "${CMAKE_BINARY_DIR}/bindingtester.touch" copy_binding_output_files)
add_dependencies(copy_bindingtester_binaries strip_only_fdbserver strip_only_fdbcli strip_only_fdb_c)
set(tar_file ${CMAKE_BINARY_DIR}/packages/bindingtester-${FDB_VERSION}.tar.gz)
add_custom_command(
OUTPUT ${tar_file}
COMMAND ${CMAKE_COMMAND} -E tar czf ${tar_file} *
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/bindingtester
COMMENT "Pack bindingtester")
add_custom_target(bindingtester ALL DEPENDS ${tar_file})
add_dependencies(bindingtester copy_bindingtester_binaries)
add_custom_target(bindingtester ALL DEPENDS ${tar_file} copy_bindingtester_binaries)
endfunction()
# Test for setting up Python venv for client tests.

View File

@ -1,4 +1,3 @@
# RPM specifics
if(CPACK_GENERATOR MATCHES "RPM")
set(CPACK_PACKAGING_INSTALL_PREFIX "/")
set(CPACK_COMPONENTS_ALL clients-el7 server-el7 clients-versioned server-versioned)

View File

@ -46,27 +46,17 @@ where `TARGET` can be any of
* aarch64-linux-gnu, aarch64-none-linux-android
* e2k-linux-gnu
Script generates two files: `libxyz.so.tramp.S` and `libxyz.so.init.c` which need to be linked to your application (instead of `-lxyz`):
Script generates two files: `libxyz.so.tramp.S` and `libxyz.so.init.cpp` which need to be linked to your application (instead of `-lxyz`):
```
$ gcc myfile1.c myfile2.c ... libxyz.so.tramp.S libxyz.so.init.c ... -ldl
$ gcc myfile1.c myfile2.c ... libxyz.so.tramp.S libxyz.so.init.cpp ... -ldl
```
Note that you need to link against libdl.so. On ARM, in case your app is compiled to Thumb code (which e.g. Ubuntu's `arm-linux-gnueabihf-gcc` does by default), you'll also need to add `-mthumb-interwork`.
The application can then freely call functions from `libxyz.so` _without linking to it_. The library will be loaded (via `dlopen`) on the first call to any of its functions. If you want to forcibly resolve all symbols (e.g. to avoid delays later on) you can call `void libxyz_init_all()`.
The above command would perform a _lazy load_, i.e. load the library on the first call to one of its symbols. If you want to load it at startup, run
```
$ implib-gen.py --no-lazy-load libxyz.so
```
If you don't want `dlopen` to be called automatically and prefer to load the library yourself at program startup, run the script as
```
$ implib-gen.py --no-dlopen libxyz.so
```
The above command would perform a _lazy load_, i.e. load the library on the first call to one of its symbols.
If you do want to load the library via `dlopen` but would prefer to call it yourself (e.g. with custom parameters or with a modified library name), run the script as
@ -100,10 +90,6 @@ $ implib-gen.py --dlopen-callback=mycallback libxyz.so
(callback must have signature `void *(*)(const char *lib_name)` and return handle of loaded library).
Finally to force library load and resolution of all symbols, call
void _LIBNAME_tramp_resolve_all(void);
# Wrapping vtables
By default the tool does not try to wrap vtables exported from the library. This can be enabled via `--vtables` flag:
@ -141,7 +127,7 @@ void *mycallback(const char *lib_name) {
}
$ implib-gen.py --dlopen-callback=mycallback --symbol-list=mysymbols.txt libxyz.so
$ ... # Link your app with libxyz.tramp.S, libxyz.init.c and mycallback.c
$ ... # Link your app with libxyz.tramp.S, libxyz.init.cpp and mycallback.c
```
A similar approach can be used if you want to provide a common interface for several libraries with partially intersecting interfaces (see [this example](tests/multilib/run.sh) for more details).
@ -156,7 +142,7 @@ To achieve this you can generate a wrapper with _renamed_ symbols which call to
$ cat mycallback.c
... Same as before ...
$ implib-gen.py --dlopen-callback=mycallback --symbol_prefix=MYPREFIX_ libxyz.so
$ ... # Link your app with libxyz.tramp.S, libxyz.init.c and mycallback.c
$ ... # Link your app with libxyz.tramp.S, libxyz.init.cpp and mycallback.c
```
# Linker wrapper

View File

@ -11,6 +11,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <mutex>
// Sanity check for ARM to avoid puzzling runtime crashes
#ifdef __arm__
@ -31,22 +32,15 @@ extern "C" {
} while(0)
#define CALL_USER_CALLBACK $has_dlopen_callback
#define NO_DLOPEN $no_dlopen
#define LAZY_LOAD $lazy_load
static void *lib_handle;
static int is_lib_loading;
static void *load_library() {
if(lib_handle)
return lib_handle;
is_lib_loading = 1;
// TODO: dlopen and users callback must be protected w/ critical section (to avoid dlopening lib twice)
#if NO_DLOPEN
CHECK(0, "internal error"); // We shouldn't get here
#elif CALL_USER_CALLBACK
#if CALL_USER_CALLBACK
extern void *$dlopen_callback(const char *lib_name);
lib_handle = $dlopen_callback("$load_name");
CHECK(lib_handle, "callback '$dlopen_callback' failed to load library");
@ -55,17 +49,9 @@ static void *load_library() {
CHECK(lib_handle, "failed to load library: %s", dlerror());
#endif
is_lib_loading = 0;
return lib_handle;
}
#if ! NO_DLOPEN && ! LAZY_LOAD
static void __attribute__((constructor)) load_lib() {
load_library();
}
#endif
static void __attribute__((destructor)) unload_lib() {
if(lib_handle)
dlclose(lib_handle);
@ -79,34 +65,35 @@ static const char *const sym_names[] = {
extern void *_${lib_suffix}_tramp_table[];
// Can be sped up by manually parsing library symtab...
void _${lib_suffix}_tramp_resolve(int i) {
assert((unsigned)i + 1 < sizeof(sym_names) / sizeof(sym_names[0]));
// Load library and resolve all symbols
static void load_and_resolve(void) {
static std::mutex load_mutex;
static int is_loaded = false;
CHECK(!is_lib_loading, "library function '%s' called during library load", sym_names[i]);
std::unique_lock<std::mutex> lock(load_mutex);
if (is_loaded)
return;
void *h = 0;
#if NO_DLOPEN
// FIXME: instead of RTLD_NEXT we should search for loaded lib_handle
// as in https://github.com/jethrogb/ssltrace/blob/bf17c150a7/ssltrace.cpp#L74-L112
h = RTLD_NEXT;
#elif LAZY_LOAD
h = load_library();
#else
h = lib_handle;
CHECK(h, "failed to resolve symbol '%s', library failed to load", sym_names[i]);
#endif
// Dlsym is thread-safe so don't need to protect it.
_${lib_suffix}_tramp_table[i] = dlsym(h, sym_names[i]);
CHECK(_${lib_suffix}_tramp_table[i], "failed to resolve symbol '%s'", sym_names[i]);
}
// Helper for user to resolve all symbols
void _${lib_suffix}_tramp_resolve_all(void) {
size_t i;
for(i = 0; i + 1 < sizeof(sym_names) / sizeof(sym_names[0]); ++i)
_${lib_suffix}_tramp_resolve(i);
// Resolving some of the symbols may fail. We ignore such failures because, if we are
// loading an older version of the library, it may lack certain functions.
_${lib_suffix}_tramp_table[i] = dlsym(h, sym_names[i]);
is_loaded = true;
}
// This function is called when the table entry for a symbol is not set.
// In that case we load the library and, if not already done, try to resolve all symbols.
// If the table entry is still missing, the symbol is not available in the loaded library,
// which is a fatal error: we immediately exit the process.
void _${lib_suffix}_tramp_resolve(int i) {
assert((unsigned)i + 1 < sizeof(sym_names) / sizeof(sym_names[0]));
load_and_resolve();
CHECK(_${lib_suffix}_tramp_table[i], "failed to resolve symbol '%s'", sym_names[i]);
}
#ifdef __cplusplus

View File

@ -22,532 +22,530 @@ import configparser
me = os.path.basename(__file__)
root = os.path.dirname(__file__)
def warn(msg):
"""Emits a nicely-decorated warning."""
sys.stderr.write(f'{me}: warning: {msg}\n')
"""Emits a nicely-decorated warning."""
sys.stderr.write(f"{me}: warning: {msg}\n")
def error(msg):
"""Emits a nicely-decorated error and exits."""
sys.stderr.write(f'{me}: error: {msg}\n')
sys.exit(1)
"""Emits a nicely-decorated error and exits."""
sys.stderr.write(f"{me}: error: {msg}\n")
sys.exit(1)
def run(args, stdin=""):
"""Runs external program and aborts on error."""
env = os.environ.copy()
# Force English language
env["LC_ALL"] = "c"
try:
del env["LANG"]
except KeyError:
pass
with subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) as p:
out, err = p.communicate(input=stdin.encode("utf-8"))
out = out.decode("utf-8")
err = err.decode("utf-8")
if p.returncode != 0 or err:
error(f"{args[0]} failed with retcode {p.returncode}:\n{err}")
return out, err
def run(args, stdin=''):
"""Runs external program and aborts on error."""
env = os.environ.copy()
# Force English language
env['LC_ALL'] = 'c'
try:
del env["LANG"]
except KeyError:
pass
with subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, env=env) as p:
out, err = p.communicate(input=stdin.encode('utf-8'))
out = out.decode('utf-8')
err = err.decode('utf-8')
if p.returncode != 0 or err:
error(f"{args[0]} failed with retcode {p.returncode}:\n{err}")
return out, err
def make_toc(words, renames=None):
"Make an mapping of words to their indices in list"
renames = renames or {}
toc = {}
for i, n in enumerate(words):
name = renames.get(n, n)
toc[i] = name
return toc
"Make an mapping of words to their indices in list"
renames = renames or {}
toc = {}
for i, n in enumerate(words):
name = renames.get(n, n)
toc[i] = name
return toc
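# Worked example (column names as emitted by readelf, purely illustrative):
#   make_toc(["Num", "Value", "Size"]) -> {0: "Num", 1: "Value", 2: "Size"}
#   make_toc(["Addr"], {"Addr": "Address"}) -> {0: "Address"}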
def parse_row(words, toc, hex_keys):
"Make a mapping from column names to values"
vals = {k: (words[i] if i < len(words) else '') for i, k in toc.items()}
for k in hex_keys:
if vals[k]:
vals[k] = int(vals[k], 16)
return vals
"Make a mapping from column names to values"
vals = {k: (words[i] if i < len(words) else "") for i, k in toc.items()}
for k in hex_keys:
if vals[k]:
vals[k] = int(vals[k], 16)
return vals
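# Worked example (made-up readelf row), assuming toc == {0: "Num", 1: "Value", 2: "Name"}:
#   parse_row(["1:", "0000a0", "foo"], toc, ["Value"]) -> {"Num": "1:", "Value": 160, "Name": "foo"}
# Columns missing from a short row come back as "", and hex keys are only converted when non-empty.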
def collect_syms(f):
"""Collect ELF dynamic symtab."""
"""Collect ELF dynamic symtab."""
# --dyn-syms does not always work for some reason so dump all symtabs
out, _ = run(['readelf', '-sW', f])
# --dyn-syms does not always work for some reason so dump all symtabs
out, _ = run(["readelf", "-sW", f])
toc = None
syms = []
syms_set = set()
for line in out.splitlines():
line = line.strip()
if not line:
# Next symtab
toc = None
continue
words = re.split(r' +', line)
if line.startswith('Num'): # Header?
if toc is not None:
error("multiple headers in output of readelf")
# Colons are different across readelf versions so get rid of them.
toc = make_toc(map(lambda n: n.replace(':', ''), words))
elif toc is not None:
sym = parse_row(words, toc, ['Value'])
name = sym['Name']
if name in syms_set:
continue
syms_set.add(name)
sym['Size'] = int(sym['Size'], 0) # Readelf is inconsistent on Size format
if '@' in name:
sym['Default'] = '@@' in name
name, ver = re.split(r'@+', name)
sym['Name'] = name
sym['Version'] = ver
else:
sym['Default'] = True
sym['Version'] = None
syms.append(sym)
toc = None
syms = []
syms_set = set()
for line in out.splitlines():
line = line.strip()
if not line:
# Next symtab
toc = None
continue
words = re.split(r" +", line)
if line.startswith("Num"): # Header?
if toc is not None:
error("multiple headers in output of readelf")
# Colons are different across readelf versions so get rid of them.
toc = make_toc(map(lambda n: n.replace(":", ""), words))
elif toc is not None:
sym = parse_row(words, toc, ["Value"])
name = sym["Name"]
if name in syms_set:
continue
syms_set.add(name)
sym["Size"] = int(sym["Size"], 0) # Readelf is inconistent on Size format
if "@" in name:
sym["Default"] = "@@" in name
name, ver = re.split(r"@+", name)
sym["Name"] = name
sym["Version"] = ver
else:
sym["Default"] = True
sym["Version"] = None
syms.append(sym)
if toc is None:
error(f"failed to analyze symbols in {f}")
if toc is None:
error(f"failed to analyze symbols in {f}")
# Also collected demangled names
if syms:
out, _ = run(['c++filt'], '\n'.join((sym['Name'] for sym in syms)))
for i, name in enumerate(out.split("\n")):
syms[i]['Demangled Name'] = name
# Also collected demangled names
if syms:
out, _ = run(["c++filt"], "\n".join((sym["Name"] for sym in syms)))
for i, name in enumerate(out.split("\n")):
syms[i]["Demangled Name"] = name
return syms
return syms
def collect_relocs(f):
"""Collect ELF dynamic relocs."""
"""Collect ELF dynamic relocs."""
out, _ = run(['readelf', '-rW', f])
out, _ = run(["readelf", "-rW", f])
toc = None
rels = []
for line in out.splitlines():
line = line.strip()
if not line:
toc = None
continue
if line == 'There are no relocations in this file.':
return []
if re.match(r'^\s*Offset', line): # Header?
if toc is not None:
error("multiple headers in output of readelf")
words = re.split(r'\s\s+', line) # "Symbol's Name + Addend"
toc = make_toc(words)
elif toc is not None:
line = re.sub(r' \+ ', '+', line)
words = re.split(r'\s+', line)
rel = parse_row(words, toc, ['Offset', 'Info'])
rels.append(rel)
# Split symbolic representation
sym_name = 'Symbol\'s Name + Addend'
if sym_name not in rel and 'Symbol\'s Name' in rel:
# Adapt to different versions of readelf
rel[sym_name] = rel['Symbol\'s Name'] + '+0'
if rel[sym_name]:
p = rel[sym_name].split('+')
if len(p) == 1:
p = ['', p[0]]
rel[sym_name] = (p[0], int(p[1], 16))
toc = None
rels = []
for line in out.splitlines():
line = line.strip()
if not line:
toc = None
continue
if line == "There are no relocations in this file.":
return []
if re.match(r"^\s*Offset", line): # Header?
if toc is not None:
error("multiple headers in output of readelf")
words = re.split(r"\s\s+", line) # "Symbol's Name + Addend"
toc = make_toc(words)
elif toc is not None:
line = re.sub(r" \+ ", "+", line)
words = re.split(r"\s+", line)
rel = parse_row(words, toc, ["Offset", "Info"])
rels.append(rel)
# Split symbolic representation
sym_name = "Symbol's Name + Addend"
if sym_name not in rel and "Symbol's Name" in rel:
# Adapt to different versions of readelf
rel[sym_name] = rel["Symbol's Name"] + "+0"
if rel[sym_name]:
p = rel[sym_name].split("+")
if len(p) == 1:
p = ["", p[0]]
rel[sym_name] = (p[0], int(p[1], 16))
if toc is None:
error(f"failed to analyze relocations in {f}")
if toc is None:
error(f"failed to analyze relocations in {f}")
return rels
return rels
def collect_sections(f):
"""Collect section info from ELF."""
"""Collect section info from ELF."""
out, _ = run(['readelf', '-SW', f])
out, _ = run(["readelf", "-SW", f])
toc = None
sections = []
for line in out.splitlines():
line = line.strip()
if not line:
continue
line = re.sub(r'\[\s+', '[', line)
words = re.split(r' +', line)
if line.startswith('[Nr]'): # Header?
if toc is not None:
error("multiple headers in output of readelf")
toc = make_toc(words, {'Addr' : 'Address'})
elif line.startswith('[') and toc is not None:
sec = parse_row(words, toc, ['Address', 'Off', 'Size'])
if 'A' in sec['Flg']: # Allocatable section?
sections.append(sec)
toc = None
sections = []
for line in out.splitlines():
line = line.strip()
if not line:
continue
line = re.sub(r"\[\s+", "[", line)
words = re.split(r" +", line)
if line.startswith("[Nr]"): # Header?
if toc is not None:
error("multiple headers in output of readelf")
toc = make_toc(words, {"Addr": "Address"})
elif line.startswith("[") and toc is not None:
sec = parse_row(words, toc, ["Address", "Off", "Size"])
if "A" in sec["Flg"]: # Allocatable section?
sections.append(sec)
if toc is None:
error(f"failed to analyze sections in {f}")
if toc is None:
error(f"failed to analyze sections in {f}")
return sections
return sections
def read_unrelocated_data(input_name, syms, secs):
"""Collect unrelocated data from ELF."""
data = {}
with open(input_name, 'rb') as f:
def is_symbol_in_section(sym, sec):
sec_end = sec['Address'] + sec['Size']
is_start_in_section = sec['Address'] <= sym['Value'] < sec_end
is_end_in_section = sym['Value'] + sym['Size'] <= sec_end
return is_start_in_section and is_end_in_section
for name, s in sorted(syms.items(), key=lambda s: s[1]['Value']):
# TODO: binary search (bisect)
sec = [sec for sec in secs if is_symbol_in_section(s, sec)]
if len(sec) != 1:
error(f"failed to locate section for interval [{s['Value']:x}, {s['Value'] + s['Size']:x})")
sec = sec[0]
f.seek(sec['Off'])
data[name] = f.read(s['Size'])
return data
"""Collect unrelocated data from ELF."""
data = {}
with open(input_name, "rb") as f:
def is_symbol_in_section(sym, sec):
sec_end = sec["Address"] + sec["Size"]
is_start_in_section = sec["Address"] <= sym["Value"] < sec_end
is_end_in_section = sym["Value"] + sym["Size"] <= sec_end
return is_start_in_section and is_end_in_section
for name, s in sorted(syms.items(), key=lambda s: s[1]["Value"]):
# TODO: binary search (bisect)
sec = [sec for sec in secs if is_symbol_in_section(s, sec)]
if len(sec) != 1:
error(f"failed to locate section for interval [{s['Value']:x}, {s['Value'] + s['Size']:x})")
sec = sec[0]
f.seek(sec["Off"])
data[name] = f.read(s["Size"])
return data
def collect_relocated_data(syms, bites, rels, ptr_size, reloc_types):
"""Identify relocations for each symbol"""
data = {}
for name, s in sorted(syms.items()):
b = bites.get(name)
assert b is not None
if s['Demangled Name'].startswith('typeinfo name'):
data[name] = [('byte', int(x)) for x in b]
continue
data[name] = []
for i in range(0, len(b), ptr_size):
val = int.from_bytes(b[i*ptr_size:(i + 1)*ptr_size], byteorder='little')
data[name].append(('offset', val))
start = s['Value']
finish = start + s['Size']
# TODO: binary search (bisect)
for rel in rels:
if rel['Type'] in reloc_types and start <= rel['Offset'] < finish:
i = (rel['Offset'] - start) // ptr_size
assert i < len(data[name])
data[name][i] = 'reloc', rel
return data
"""Identify relocations for each symbol"""
data = {}
for name, s in sorted(syms.items()):
b = bites.get(name)
assert b is not None
if s["Demangled Name"].startswith("typeinfo name"):
data[name] = [("byte", int(x)) for x in b]
continue
data[name] = []
for i in range(0, len(b), ptr_size):
val = int.from_bytes(b[i * ptr_size : (i + 1) * ptr_size], byteorder="little")
data[name].append(("offset", val))
start = s["Value"]
finish = start + s["Size"]
# TODO: binary search (bisect)
for rel in rels:
if rel["Type"] in reloc_types and start <= rel["Offset"] < finish:
i = (rel["Offset"] - start) // ptr_size
assert i < len(data[name])
data[name][i] = "reloc", rel
return data
def generate_vtables(cls_tables, cls_syms, cls_data):
"""Generate code for vtables"""
c_types = {
'reloc' : 'const void *',
'byte' : 'unsigned char',
'offset' : 'size_t'
}
"""Generate code for vtables"""
c_types = {"reloc": "const void *", "byte": "unsigned char", "offset": "size_t"}
ss = []
ss.append('''\
ss = []
ss.append(
"""\
#ifdef __cplusplus
extern "C" {
#endif
''')
"""
)
# Print externs
# Print externs
printed = set()
for name, data in sorted(cls_data.items()):
for typ, val in data:
if typ != 'reloc':
continue
sym_name, addend = val['Symbol\'s Name + Addend']
sym_name = re.sub(r'@.*', '', sym_name) # Can we pin version in C?
if sym_name not in cls_syms and sym_name not in printed:
ss.append(f'''\
printed = set()
for name, data in sorted(cls_data.items()):
for typ, val in data:
if typ != "reloc":
continue
sym_name, addend = val["Symbol's Name + Addend"]
sym_name = re.sub(r"@.*", "", sym_name) # Can we pin version in C?
if sym_name not in cls_syms and sym_name not in printed:
ss.append(
f"""\
extern const char {sym_name}[];
''')
"""
)
# Collect variable infos
# Collect variable infos
code_info = {}
code_info = {}
for name, s in sorted(cls_syms.items()):
data = cls_data[name]
if s['Demangled Name'].startswith('typeinfo name'):
declarator = 'const unsigned char %s[]'
else:
field_types = (f'{c_types[typ]} field_{i};' for i, (typ, _) in enumerate(data))
declarator = 'const struct { %s } %%s' % ' '.join(field_types) # pylint: disable=C0209 # consider-using-f-string
vals = []
for typ, val in data:
if typ != 'reloc':
vals.append(str(val) + 'UL')
else:
sym_name, addend = val['Symbol\'s Name + Addend']
sym_name = re.sub(r'@.*', '', sym_name) # Can we pin version in C?
vals.append(f'(const char *)&{sym_name} + {addend}')
code_info[name] = (declarator, '{ %s }' % ', '.join(vals)) # pylint: disable= C0209 # consider-using-f-string
for name, s in sorted(cls_syms.items()):
data = cls_data[name]
if s["Demangled Name"].startswith("typeinfo name"):
declarator = "const unsigned char %s[]"
else:
field_types = (f"{c_types[typ]} field_{i};" for i, (typ, _) in enumerate(data))
declarator = "const struct { %s } %%s" % " ".join(
field_types
) # pylint: disable=C0209 # consider-using-f-string
vals = []
for typ, val in data:
if typ != "reloc":
vals.append(str(val) + "UL")
else:
sym_name, addend = val["Symbol's Name + Addend"]
sym_name = re.sub(r"@.*", "", sym_name) # Can we pin version in C?
vals.append(f"(const char *)&{sym_name} + {addend}")
code_info[name] = (declarator, "{ %s }" % ", ".join(vals)) # pylint: disable= C0209 # consider-using-f-string
# Print declarations
# Print declarations
for name, (decl, _) in sorted(code_info.items()):
type_name = name + '_type'
type_decl = decl % type_name
ss.append(f'''\
for name, (decl, _) in sorted(code_info.items()):
type_name = name + "_type"
type_decl = decl % type_name
ss.append(
f"""\
typedef {type_decl};
extern __attribute__((weak)) {type_name} {name};
''')
"""
)
# Print definitions
# Print definitions
for name, (_, init) in sorted(code_info.items()):
type_name = name + '_type'
ss.append(f'''\
for name, (_, init) in sorted(code_info.items()):
type_name = name + "_type"
ss.append(
f"""\
const {type_name} {name} = {init};
''')
"""
)
ss.append('''\
ss.append(
"""\
#ifdef __cplusplus
} // extern "C"
#endif
''')
"""
)
return "".join(ss)
return ''.join(ss)
def main():
"""Driver function"""
parser = argparse.ArgumentParser(description="Generate wrappers for shared library functions.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=f"""\
"""Driver function"""
parser = argparse.ArgumentParser(
description="Generate wrappers for shared library functions.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=f"""\
Examples:
$ python3 {me} /usr/lib/x86_64-linux-gnu/libaccountsservice.so.0
Generating libaccountsservice.so.0.tramp.S...
Generating libaccountsservice.so.0.init.c...
""")
Generating libaccountsservice.so.0.init.cpp...
""",
)
parser.add_argument('library',
metavar='LIB',
help="Library to be wrapped.")
parser.add_argument('--verbose', '-v',
help="Print diagnostic info",
action='count',
default=0)
parser.add_argument('--dlopen-callback',
help="Call user-provided custom callback to load library instead of dlopen",
default='')
parser.add_argument('--dlopen',
help="Emit dlopen call (default)",
dest='dlopen', action='store_true', default=True)
parser.add_argument('--no-dlopen',
help="Do not emit dlopen call (user must load library himself)",
dest='dlopen', action='store_false')
parser.add_argument('--library-load-name',
help="Use custom name for dlopened library (default is LIB)")
parser.add_argument('--lazy-load',
help="Load library lazily on first call to one of it's functions (default)",
dest='lazy_load', action='store_true', default=True)
parser.add_argument('--no-lazy-load',
help="Load library eagerly at program start",
dest='lazy_load', action='store_false')
parser.add_argument('--vtables',
help="Intercept virtual tables (EXPERIMENTAL)",
dest='vtables', action='store_true', default=False)
parser.add_argument('--no-vtables',
help="Do not intercept virtual tables (default)",
dest='vtables', action='store_false')
parser.add_argument('--target',
help="Target platform triple e.g. x86_64-unknown-linux-gnu or arm-none-eabi "
"(atm x86_64, i[0-9]86, arm/armhf/armeabi, aarch64/armv8 "
"and e2k are supported)",
default=os.uname()[-1])
parser.add_argument('--symbol-list',
help="Path to file with symbols that should be present in wrapper "
"(all by default)")
parser.add_argument('--symbol-prefix',
metavar='PFX',
help="Prefix wrapper symbols with PFX",
default='')
parser.add_argument('-q', '--quiet',
help="Do not print progress info",
action='store_true')
parser.add_argument('--outdir', '-o',
help="Path to create wrapper at",
default='./')
parser.add_argument("library", metavar="LIB", help="Library to be wrapped.")
parser.add_argument("--verbose", "-v", help="Print diagnostic info", action="count", default=0)
parser.add_argument(
"--dlopen-callback", help="Call user-provided custom callback to load library instead of dlopen", default=""
)
parser.add_argument("--library-load-name", help="Use custom name for dlopened library (default is LIB)")
parser.add_argument(
"--vtables", help="Intercept virtual tables (EXPERIMENTAL)", dest="vtables", action="store_true", default=False
)
parser.add_argument(
"--no-vtables", help="Do not intercept virtual tables (default)", dest="vtables", action="store_false"
)
parser.add_argument(
"--target",
help="Target platform triple e.g. x86_64-unknown-linux-gnu or arm-none-eabi "
"(atm x86_64, i[0-9]86, arm/armhf/armeabi, aarch64/armv8 "
"and e2k are supported)",
default=os.uname()[-1],
)
parser.add_argument(
"--symbol-list", help="Path to file with symbols that should be present in wrapper " "(all by default)"
)
parser.add_argument("--symbol-prefix", metavar="PFX", help="Prefix wrapper symbols with PFX", default="")
parser.add_argument("-q", "--quiet", help="Do not print progress info", action="store_true")
parser.add_argument("--outdir", "-o", help="Path to create wrapper at", default="./")
args = parser.parse_args()
args = parser.parse_args()
input_name = args.library
verbose = args.verbose
dlopen_callback = args.dlopen_callback
dlopen = args.dlopen
lazy_load = args.lazy_load
load_name = args.library_load_name or os.path.basename(input_name)
if args.target.startswith('arm'):
target = 'arm' # Handle armhf-..., armel-...
elif re.match(r'^i[0-9]86', args.target):
target = 'i386'
else:
target = args.target.split('-')[0]
quiet = args.quiet
outdir = args.outdir
input_name = args.library
verbose = args.verbose
dlopen_callback = args.dlopen_callback
load_name = args.library_load_name or os.path.basename(input_name)
if args.target.startswith("arm"):
target = "arm" # Handle armhf-..., armel-...
elif re.match(r"^i[0-9]86", args.target):
target = "i386"
else:
target = args.target.split("-")[0]
quiet = args.quiet
outdir = args.outdir
if args.symbol_list is None:
funs = None
else:
with open(args.symbol_list, 'r') as f:
funs = []
for line in re.split(r'\r?\n', f.read()):
line = re.sub(r'#.*', '', line)
line = line.strip()
if line:
funs.append(line)
if args.symbol_list is None:
funs = None
else:
with open(args.symbol_list, "r") as f:
funs = []
for line in re.split(r"\r?\n", f.read()):
line = re.sub(r"#.*", "", line)
line = line.strip()
if line:
funs.append(line)
# Collect target info
# Collect target info
target_dir = os.path.join(root, 'arch', target)
target_dir = os.path.join(root, "arch", target)
if not os.path.exists(target_dir):
error(f"unknown architecture '{target}'")
if not os.path.exists(target_dir):
error(f"unknown architecture '{target}'")
cfg = configparser.ConfigParser(inline_comment_prefixes=';')
cfg.read(target_dir + '/config.ini')
cfg = configparser.ConfigParser(inline_comment_prefixes=";")
cfg.read(target_dir + "/config.ini")
ptr_size = int(cfg['Arch']['PointerSize'])
symbol_reloc_types = set(re.split(r'\s*,\s*', cfg['Arch']['SymbolReloc']))
ptr_size = int(cfg["Arch"]["PointerSize"])
symbol_reloc_types = set(re.split(r"\s*,\s*", cfg["Arch"]["SymbolReloc"]))
def is_exported(s):
return (s['Bind'] != 'LOCAL'
and s['Type'] != 'NOTYPE'
and s['Ndx'] != 'UND'
and s['Name'] not in ['', '_init', '_fini'])
def is_exported(s):
return (
s["Bind"] != "LOCAL"
and s["Type"] != "NOTYPE"
and s["Ndx"] != "UND"
and s["Name"] not in ["", "_init", "_fini"]
)
syms = list(filter(is_exported, collect_syms(input_name)))
syms = list(filter(is_exported, collect_syms(input_name)))
def is_data_symbol(s):
return (s['Type'] == 'OBJECT'
def is_data_symbol(s):
return (
s["Type"] == "OBJECT"
# Allow vtables if --vtables is on
and not (' for ' in s['Demangled Name'] and args.vtables))
and not (" for " in s["Demangled Name"] and args.vtables)
)
exported_data = [s['Name'] for s in syms if is_data_symbol(s)]
if exported_data:
# TODO: we can generate wrappers for const data without relocations (or only code relocations)
warn(f"library '{input_name}' contains data symbols which won't be intercepted: "
+ ', '.join(exported_data))
exported_data = [s["Name"] for s in syms if is_data_symbol(s)]
if exported_data:
# TODO: we can generate wrappers for const data without relocations (or only code relocations)
warn(f"library '{input_name}' contains data symbols which won't be intercepted: " + ", ".join(exported_data))
# Collect functions
# TODO: warn if user-specified functions are missing
# Collect functions
# TODO: warn if user-specified functions are missing
orig_funs = filter(lambda s: s['Type'] == 'FUNC', syms)
orig_funs = filter(lambda s: s["Type"] == "FUNC", syms)
all_funs = set()
warn_versioned = False
for s in orig_funs:
if s['Version'] is not None:
# TODO: support versions
if not warn_versioned:
warn(f"library {input_name} contains versioned symbols which are NYI")
warn_versioned = True
if verbose:
print(f"Skipping versioned symbol {s['Name']}")
continue
all_funs.add(s['Name'])
all_funs = set()
warn_versioned = False
for s in orig_funs:
if s["Version"] is not None:
# TODO: support versions
if not warn_versioned:
warn(f"library {input_name} contains versioned symbols which are NYI")
warn_versioned = True
if verbose:
print(f"Skipping versioned symbol {s['Name']}")
continue
all_funs.add(s["Name"])
if funs is None:
funs = sorted(list(all_funs))
if not funs and not quiet:
warn(f"no public functions were found in {input_name}")
else:
missing_funs = [name for name in funs if name not in all_funs]
if missing_funs:
warn("some user-specified functions are not present in library: " + ', '.join(missing_funs))
funs = [name for name in funs if name in all_funs]
if verbose:
print("Exported functions:")
for i, fun in enumerate(funs):
print(f" {i}: {fun}")
# Collect vtables
if args.vtables:
cls_tables = {}
cls_syms = {}
for s in syms:
m = re.match(r'^(vtable|typeinfo|typeinfo name) for (.*)', s['Demangled Name'])
if m is not None and is_exported(s):
typ, cls = m.groups()
name = s['Name']
cls_tables.setdefault(cls, {})[typ] = name
cls_syms[name] = s
if funs is None:
funs = sorted(list(all_funs))
if not funs and not quiet:
warn(f"no public functions were found in {input_name}")
else:
missing_funs = [name for name in funs if name not in all_funs]
if missing_funs:
warn("some user-specified functions are not present in library: " + ", ".join(missing_funs))
funs = [name for name in funs if name in all_funs]
if verbose:
print("Exported classes:")
for cls, _ in sorted(cls_tables.items()):
print(f" {cls}")
print("Exported functions:")
for i, fun in enumerate(funs):
print(f" {i}: {fun}")
secs = collect_sections(input_name)
if verbose:
print("Sections:")
for sec in secs:
print(f" {sec['Name']}: [{sec['Address']:x}, {sec['Address'] + sec['Size']:x}), "
f"at {sec['Off']:x}")
# Collect vtables
bites = read_unrelocated_data(input_name, cls_syms, secs)
rels = collect_relocs(input_name)
if verbose:
print("Relocs:")
for rel in rels:
sym_add = rel['Symbol\'s Name + Addend']
print(f" {rel['Offset']}: {sym_add}")
cls_data = collect_relocated_data(cls_syms, bites, rels, ptr_size, symbol_reloc_types)
if verbose:
print("Class data:")
for name, data in sorted(cls_data.items()):
demangled_name = cls_syms[name]['Demangled Name']
print(f" {name} ({demangled_name}):")
for typ, val in data:
print(" " + str(val if typ != 'reloc' else val['Symbol\'s Name + Addend']))
# Generate assembly code
suffix = os.path.basename(load_name)
lib_suffix = re.sub(r'[^a-zA-Z_0-9]+', '_', suffix)
tramp_file = f'{suffix}.tramp.S'
with open(os.path.join(outdir, tramp_file), 'w') as f:
if not quiet:
print(f"Generating {tramp_file}...")
with open(target_dir + '/table.S.tpl', 'r') as t:
table_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix,
table_size=ptr_size*(len(funs) + 1))
f.write(table_text)
with open(target_dir + '/trampoline.S.tpl', 'r') as t:
tramp_tpl = string.Template(t.read())
for i, name in enumerate(funs):
tramp_text = tramp_tpl.substitute(
lib_suffix=lib_suffix,
sym=args.symbol_prefix + name,
offset=i*ptr_size,
number=i)
f.write(tramp_text)
# Generate C code
init_file = f'{suffix}.init.c'
with open(os.path.join(outdir, init_file), 'w') as f:
if not quiet:
print(f"Generating {init_file}...")
with open(os.path.join(root, 'arch/common/init.c.tpl'), 'r') as t:
if funs:
sym_names = ',\n '.join(f'"{name}"' for name in funs) + ','
else:
sym_names = ''
init_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix,
load_name=load_name,
dlopen_callback=dlopen_callback,
has_dlopen_callback=int(bool(dlopen_callback)),
no_dlopen=not int(dlopen),
lazy_load=int(lazy_load),
sym_names=sym_names)
f.write(init_text)
if args.vtables:
vtable_text = generate_vtables(cls_tables, cls_syms, cls_data)
f.write(vtable_text)
cls_tables = {}
cls_syms = {}
if __name__ == '__main__':
main()
for s in syms:
m = re.match(r"^(vtable|typeinfo|typeinfo name) for (.*)", s["Demangled Name"])
if m is not None and is_exported(s):
typ, cls = m.groups()
name = s["Name"]
cls_tables.setdefault(cls, {})[typ] = name
cls_syms[name] = s
if verbose:
print("Exported classes:")
for cls, _ in sorted(cls_tables.items()):
print(f" {cls}")
secs = collect_sections(input_name)
if verbose:
print("Sections:")
for sec in secs:
print(f" {sec['Name']}: [{sec['Address']:x}, {sec['Address'] + sec['Size']:x}), " f"at {sec['Off']:x}")
bites = read_unrelocated_data(input_name, cls_syms, secs)
rels = collect_relocs(input_name)
if verbose:
print("Relocs:")
for rel in rels:
sym_add = rel["Symbol's Name + Addend"]
print(f" {rel['Offset']}: {sym_add}")
cls_data = collect_relocated_data(cls_syms, bites, rels, ptr_size, symbol_reloc_types)
if verbose:
print("Class data:")
for name, data in sorted(cls_data.items()):
demangled_name = cls_syms[name]["Demangled Name"]
print(f" {name} ({demangled_name}):")
for typ, val in data:
print(" " + str(val if typ != "reloc" else val["Symbol's Name + Addend"]))
# Generate assembly code
suffix = os.path.basename(load_name)
lib_suffix = re.sub(r"[^a-zA-Z_0-9]+", "_", suffix)
tramp_file = f"{suffix}.tramp.S"
with open(os.path.join(outdir, tramp_file), "w") as f:
if not quiet:
print(f"Generating {tramp_file}...")
with open(target_dir + "/table.S.tpl", "r") as t:
table_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix, table_size=ptr_size * (len(funs) + 1)
)
f.write(table_text)
with open(target_dir + "/trampoline.S.tpl", "r") as t:
tramp_tpl = string.Template(t.read())
for i, name in enumerate(funs):
tramp_text = tramp_tpl.substitute(
lib_suffix=lib_suffix, sym=args.symbol_prefix + name, offset=i * ptr_size, number=i
)
f.write(tramp_text)
# Generate C code
init_file = f"{suffix}.init.cpp"
with open(os.path.join(outdir, init_file), "w") as f:
if not quiet:
print(f"Generating {init_file}...")
with open(os.path.join(root, "arch/common/init.cpp.tpl"), "r") as t:
if funs:
sym_names = ",\n ".join(f'"{name}"' for name in funs) + ","
else:
sym_names = ""
init_text = string.Template(t.read()).substitute(
lib_suffix=lib_suffix,
load_name=load_name,
dlopen_callback=dlopen_callback,
has_dlopen_callback=int(bool(dlopen_callback)),
sym_names=sym_names,
)
f.write(init_text)
if args.vtables:
vtable_text = generate_vtables(cls_tables, cls_syms, cls_data)
f.write(vtable_text)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,7 @@
#! /usr/bin/env bash
set -e
set -o pipefail
# It is necessary to tee to output.log in case a timeout happens
python3 ./binding_test.py --stop-at-failure 10 --fdbserver-path $(pwd)/fdbserver --fdbcli-path $(pwd)/fdbcli --libfdb-path $(pwd) --num-ops 1000 --num-hca-ops 100 --concurrency 5 --test-timeout 60 --random | tee output.log

View File

@ -0,0 +1,4 @@
#! /usr/bin/env bash
echo "Binding test timed out"
cat output.log

0
contrib/Joshua/scripts/localClusterStart.sh Normal file → Executable file
View File

View File

@ -60,7 +60,7 @@ class StatFetcher:
class TestPicker:
def __init__(self, test_dir: Path):
if not test_dir.exists():
raise RuntimeError('{} is neither a directory nor a file'.format(test_dir))
raise RuntimeError("{} is neither a directory nor a file".format(test_dir))
self.include_files_regex = re.compile(config.include_test_files)
self.exclude_files_regex = re.compile(config.exclude_test_files)
self.include_tests_regex = re.compile(config.include_test_classes)
@ -78,6 +78,7 @@ class TestPicker:
self.stat_fetcher = StatFetcher(self.tests)
else:
from test_harness.fdb import FDBStatFetcher
self.stat_fetcher = FDBStatFetcher(self.tests)
if config.stats is not None:
self.load_stats(config.stats)
@ -106,50 +107,60 @@ class TestPicker:
break
assert test_name is not None and test_desc is not None
self.stat_fetcher.add_run_time(test_name, run_time, out)
out.attributes['TotalTestTime'] = str(test_desc.total_runtime)
out.attributes['TestRunCount'] = str(test_desc.num_runs)
out.attributes["TotalTestTime"] = str(test_desc.total_runtime)
out.attributes["TestRunCount"] = str(test_desc.num_runs)
def dump_stats(self) -> str:
res = array.array('I')
res = array.array("I")
for _, spec in self.tests.items():
res.append(spec.total_runtime)
return base64.standard_b64encode(res.tobytes()).decode('utf-8')
return base64.standard_b64encode(res.tobytes()).decode("utf-8")
def fetch_stats(self):
self.stat_fetcher.read_stats()
def load_stats(self, serialized: str):
times = array.array('I')
times = array.array("I")
times.frombytes(base64.standard_b64decode(serialized))
assert len(times) == len(self.tests.items())
for idx, (_, spec) in enumerate(self.tests.items()):
spec.total_runtime = times[idx]
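# Minimal sketch of the round trip used by dump_stats/load_stats (values are made up):
#   encoded = base64.standard_b64encode(array.array("I", [12, 340]).tobytes()).decode("utf-8")
#   decoded = array.array("I")
#   decoded.frombytes(base64.standard_b64decode(encoded))  # -> array('I', [12, 340])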
def parse_txt(self, path: Path):
if self.include_files_regex.search(str(path)) is None or self.exclude_files_regex.search(str(path)) is not None:
if (
self.include_files_regex.search(str(path)) is None
or self.exclude_files_regex.search(str(path)) is not None
):
return
with path.open('r') as f:
with path.open("r") as f:
test_name: str | None = None
test_class: str | None = None
priority: float | None = None
for line in f:
line = line.strip()
kv = line.split('=')
kv = line.split("=")
if len(kv) != 2:
continue
kv[0] = kv[0].strip()
kv[1] = kv[1].strip(' \r\n\t\'"')
if kv[0] == 'testTitle' and test_name is None:
kv[1] = kv[1].strip(" \r\n\t'\"")
if kv[0] == "testTitle" and test_name is None:
test_name = kv[1]
if kv[0] == 'testClass' and test_class is None:
if kv[0] == "testClass" and test_class is None:
test_class = kv[1]
if kv[0] == 'testPriority' and priority is None:
if kv[0] == "testPriority" and priority is None:
try:
priority = float(kv[1])
except ValueError:
raise RuntimeError("Can't parse {} -- testPriority in {} should be set to a float".format(kv[1],
path))
if test_name is not None and test_class is not None and priority is not None:
raise RuntimeError(
"Can't parse {} -- testPriority in {} should be set to a float".format(
kv[1], path
)
)
if (
test_name is not None
and test_class is not None
and priority is not None
):
break
if test_name is None:
return
@ -157,8 +168,10 @@ class TestPicker:
test_class = test_name
if priority is None:
priority = 1.0
if self.include_tests_regex.search(test_class) is None \
or self.exclude_tests_regex.search(test_class) is not None:
if (
self.include_tests_regex.search(test_class) is None
or self.exclude_tests_regex.search(test_class) is not None
):
return
if test_class not in self.tests:
self.tests[test_class] = TestDescription(path, test_class, priority)
@ -173,12 +186,12 @@ class TestPicker:
# check whether we're looking at a restart test
if self.follow_test.match(test.name) is not None:
return
if test.suffix == '.txt' or test.suffix == '.toml':
if test.suffix == ".txt" or test.suffix == ".toml":
self.parse_txt(test)
@staticmethod
def list_restart_files(start_file: Path) -> List[Path]:
name = re.sub(r'-\d+.(txt|toml)', '', start_file.name)
name = re.sub(r"-\d+.(txt|toml)", "", start_file.name)
res: List[Path] = []
for test_file in start_file.parent.iterdir():
if test_file.name.startswith(name):
@ -209,12 +222,12 @@ class TestPicker:
class OldBinaries:
def __init__(self):
self.first_file_expr = re.compile(r'.*-1\.(txt|toml)')
self.first_file_expr = re.compile(r".*-1\.(txt|toml)")
self.old_binaries_path: Path = config.old_binaries_path
self.binaries: OrderedDict[Version, Path] = collections.OrderedDict()
if not self.old_binaries_path.exists() or not self.old_binaries_path.is_dir():
return
exec_pattern = re.compile(r'fdbserver-\d+\.\d+\.\d+(\.exe)?')
exec_pattern = re.compile(r"fdbserver-\d+\.\d+\.\d+(\.exe)?")
for file in self.old_binaries_path.iterdir():
if not file.is_file() or not os.access(file, os.X_OK):
continue
@ -222,9 +235,9 @@ class OldBinaries:
self._add_file(file)
def _add_file(self, file: Path):
version_str = file.name.split('-')[1]
if version_str.endswith('.exe'):
version_str = version_str[0:-len('.exe')]
version_str = file.name.split("-")[1]
if version_str.endswith(".exe"):
version_str = version_str[0 : -len(".exe")]
ver = Version.parse(version_str)
self.binaries[ver] = file
@ -232,21 +245,21 @@ class OldBinaries:
if len(self.binaries) == 0:
return config.binary
max_version = Version.max_version()
min_version = Version.parse('5.0.0')
min_version = Version.parse("5.0.0")
dirs = test_file.parent.parts
if 'restarting' not in dirs:
if "restarting" not in dirs:
return config.binary
version_expr = dirs[-1].split('_')
version_expr = dirs[-1].split("_")
first_file = self.first_file_expr.match(test_file.name) is not None
if first_file and version_expr[0] == 'to':
if first_file and version_expr[0] == "to":
# downgrade test -- first binary should be current one
return config.binary
if not first_file and version_expr[0] == 'from':
if not first_file and version_expr[0] == "from":
# upgrade test -- we only return an old version for the first test file
return config.binary
if version_expr[0] == 'from' or version_expr[0] == 'to':
if version_expr[0] == "from" or version_expr[0] == "to":
min_version = Version.parse(version_expr[1])
if len(version_expr) == 4 and version_expr[2] == 'until':
if len(version_expr) == 4 and version_expr[2] == "until":
max_version = Version.parse(version_expr[3])
candidates: List[Path] = []
for ver, binary in self.binaries.items():
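For orientation, a small sketch (separate from the harness) of how a restarting-test directory name such as "from_7.1.0_until_7.3.0" maps to the version window computed in choose_binary above; the directory name and versions are illustrative:
# illustrative only: mirrors the version_expr parsing in OldBinaries.choose_binary
dir_name = "from_7.1.0_until_7.3.0"
version_expr = dir_name.split("_")   # ["from", "7.1.0", "until", "7.3.0"]
min_version = "5.0.0"
max_version = None                   # no upper bound unless "until" is present
if version_expr[0] in ("from", "to"):
    min_version = version_expr[1]
if len(version_expr) == 4 and version_expr[2] == "until":
    max_version = version_expr[3]
print(min_version, max_version)      # 7.1.0 7.3.0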
@ -259,13 +272,13 @@ class OldBinaries:
def is_restarting_test(test_file: Path):
for p in test_file.parts:
if p == 'restarting':
if p == "restarting":
return True
return False
def is_no_sim(test_file: Path):
return test_file.parts[-2] == 'noSim'
return test_file.parts[-2] == "noSim"
class ResourceMonitor(threading.Thread):
@ -291,9 +304,19 @@ class ResourceMonitor(threading.Thread):
class TestRun:
def __init__(self, binary: Path, test_file: Path, random_seed: int, uid: uuid.UUID,
restarting: bool = False, test_determinism: bool = False, buggify_enabled: bool = False,
stats: str | None = None, expected_unseed: int | None = None, will_restart: bool = False):
def __init__(
self,
binary: Path,
test_file: Path,
random_seed: int,
uid: uuid.UUID,
restarting: bool = False,
test_determinism: bool = False,
buggify_enabled: bool = False,
stats: str | None = None,
expected_unseed: int | None = None,
will_restart: bool = False,
):
self.binary = binary
self.test_file = test_file
self.random_seed = random_seed
@ -313,23 +336,31 @@ class TestRun:
self.temp_path = config.run_dir / str(self.uid)
# state for the run
self.retryable_error: bool = False
self.summary: Summary = Summary(binary, uid=self.uid, stats=self.stats, expected_unseed=self.expected_unseed,
will_restart=will_restart, long_running=config.long_running)
self.summary: Summary = Summary(
binary,
uid=self.uid,
stats=self.stats,
expected_unseed=self.expected_unseed,
will_restart=will_restart,
long_running=config.long_running,
)
self.run_time: int = 0
self.success = self.run()
def log_test_plan(self, out: SummaryTree):
test_plan: SummaryTree = SummaryTree('TestPlan')
test_plan.attributes['TestUID'] = str(self.uid)
test_plan.attributes['RandomSeed'] = str(self.random_seed)
test_plan.attributes['TestFile'] = str(self.test_file)
test_plan.attributes['Buggify'] = '1' if self.buggify_enabled else '0'
test_plan.attributes['FaultInjectionEnabled'] = '1' if self.fault_injection_enabled else '0'
test_plan.attributes['DeterminismCheck'] = '1' if self.test_determinism else '0'
test_plan: SummaryTree = SummaryTree("TestPlan")
test_plan.attributes["TestUID"] = str(self.uid)
test_plan.attributes["RandomSeed"] = str(self.random_seed)
test_plan.attributes["TestFile"] = str(self.test_file)
test_plan.attributes["Buggify"] = "1" if self.buggify_enabled else "0"
test_plan.attributes["FaultInjectionEnabled"] = (
"1" if self.fault_injection_enabled else "0"
)
test_plan.attributes["DeterminismCheck"] = "1" if self.test_determinism else "0"
out.append(test_plan)
def delete_simdir(self):
shutil.rmtree(self.temp_path / Path('simfdb'))
shutil.rmtree(self.temp_path / Path("simfdb"))
def run(self):
command: List[str] = []
@ -341,47 +372,68 @@ class TestRun:
# the test take longer. Also old binaries weren't built with
# USE_VALGRIND=ON, and we have seen false positives with valgrind in
# such binaries.
command.append('valgrind')
valgrind_file = self.temp_path / Path('valgrind-{}.xml'.format(self.random_seed))
dbg_path = os.getenv('FDB_VALGRIND_DBGPATH')
command.append("valgrind")
valgrind_file = self.temp_path / Path(
"valgrind-{}.xml".format(self.random_seed)
)
dbg_path = os.getenv("FDB_VALGRIND_DBGPATH")
if dbg_path is not None:
command.append('--extra-debuginfo-path={}'.format(dbg_path))
command += ['--xml=yes', '--xml-file={}'.format(valgrind_file.absolute()), '-q']
command += [str(self.binary.absolute()),
'-r', 'test' if is_no_sim(self.test_file) else 'simulation',
'-f', str(self.test_file),
'-s', str(self.random_seed)]
command.append("--extra-debuginfo-path={}".format(dbg_path))
command += [
"--xml=yes",
"--xml-file={}".format(valgrind_file.absolute()),
"-q",
]
command += [
str(self.binary.absolute()),
"-r",
"test" if is_no_sim(self.test_file) else "simulation",
"-f",
str(self.test_file),
"-s",
str(self.random_seed),
]
if self.trace_format is not None:
command += ['--trace_format', self.trace_format]
command += ["--trace_format", self.trace_format]
if self.use_tls_plugin:
command += ['--tls_plugin', str(config.tls_plugin_path)]
command += ["--tls_plugin", str(config.tls_plugin_path)]
env["FDB_TLS_PLUGIN"] = str(config.tls_plugin_path)
if config.disable_kaio:
command += ['--knob-disable-posix-kernel-aio=1']
if Version.of_binary(self.binary) >= '7.1.0':
command += ['-fi', 'on' if self.fault_injection_enabled else 'off']
command += ["--knob-disable-posix-kernel-aio=1"]
if Version.of_binary(self.binary) >= "7.1.0":
command += ["-fi", "on" if self.fault_injection_enabled else "off"]
if self.restarting:
command.append('--restarting')
command.append("--restarting")
if self.buggify_enabled:
command += ['-b', 'on']
command += ["-b", "on"]
if config.crash_on_error:
command.append('--crash')
command.append("--crash")
if config.long_running:
# disable simulation speedup
command += ['--knob-sim-speedup-after-seconds=36000']
command += ["--knob-sim-speedup-after-seconds=36000"]
# disable traceTooManyLines Error MAX_TRACE_LINES
command += ['--knob-max-trace-lines=1000000000']
command += ["--knob-max-trace-lines=1000000000"]
self.temp_path.mkdir(parents=True, exist_ok=True)
# self.log_test_plan(out)
resources = ResourceMonitor()
resources.start()
process = subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, cwd=self.temp_path,
text=True, env=env)
process = subprocess.Popen(
command,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
cwd=self.temp_path,
text=True,
env=env,
)
did_kill = False
# No timeout for long running tests
timeout = 20 * config.kill_seconds if self.use_valgrind else (None if config.long_running else config.kill_seconds)
timeout = (
20 * config.kill_seconds
if self.use_valgrind
else (None if config.long_running else config.kill_seconds)
)
err_out: str
try:
_, err_out = process.communicate(timeout=timeout)
@ -398,7 +450,7 @@ class TestRun:
self.summary.was_killed = did_kill
self.summary.valgrind_out_file = valgrind_file
self.summary.error_out = err_out
self.summary.summarize(self.temp_path, ' '.join(command))
self.summary.summarize(self.temp_path, " ".join(command))
return self.summary.ok()
@ -407,18 +459,18 @@ def decorate_summary(out: SummaryTree, test_file: Path, seed: int, buggify: bool
tests are then hard to reproduce (they can be reproduced through TestHarness but
require the user to run in the joshua docker container). To account for this we
will write the necessary information into the attributes if it is missing."""
if 'TestFile' not in out.attributes:
out.attributes['TestFile'] = str(test_file)
if 'RandomSeed' not in out.attributes:
out.attributes['RandomSeed'] = str(seed)
if 'BuggifyEnabled' not in out.attributes:
out.attributes['BuggifyEnabled'] = '1' if buggify else '0'
if "TestFile" not in out.attributes:
out.attributes["TestFile"] = str(test_file)
if "RandomSeed" not in out.attributes:
out.attributes["RandomSeed"] = str(seed)
if "BuggifyEnabled" not in out.attributes:
out.attributes["BuggifyEnabled"] = "1" if buggify else "0"
class TestRunner:
def __init__(self):
self.uid = uuid.uuid4()
self.test_path: Path = Path('tests')
self.test_path: Path = Path("tests")
self.cluster_file: str | None = None
self.fdb_app_dir: str | None = None
self.binary_chooser = OldBinaries()
@ -426,32 +478,43 @@ class TestRunner:
def backup_sim_dir(self, seed: int):
temp_dir = config.run_dir / str(self.uid)
src_dir = temp_dir / 'simfdb'
src_dir = temp_dir / "simfdb"
assert src_dir.is_dir()
dest_dir = temp_dir / 'simfdb.{}'.format(seed)
dest_dir = temp_dir / "simfdb.{}".format(seed)
assert not dest_dir.exists()
shutil.copytree(src_dir, dest_dir)
def restore_sim_dir(self, seed: int):
temp_dir = config.run_dir / str(self.uid)
src_dir = temp_dir / 'simfdb.{}'.format(seed)
src_dir = temp_dir / "simfdb.{}".format(seed)
assert src_dir.exists()
dest_dir = temp_dir / 'simfdb'
dest_dir = temp_dir / "simfdb"
shutil.rmtree(dest_dir)
shutil.move(src_dir, dest_dir)
def run_tests(self, test_files: List[Path], seed: int, test_picker: TestPicker) -> bool:
def run_tests(
self, test_files: List[Path], seed: int, test_picker: TestPicker
) -> bool:
result: bool = True
for count, file in enumerate(test_files):
will_restart = count + 1 < len(test_files)
binary = self.binary_chooser.choose_binary(file)
unseed_check = not is_no_sim(file) and config.random.random() < config.unseed_check_ratio
unseed_check = (
not is_no_sim(file)
and config.random.random() < config.unseed_check_ratio
)
buggify_enabled: bool = config.random.random() < config.buggify_on_ratio
if unseed_check and count != 0:
# for restarting tests we will need to restore the sim2 after the first run
self.backup_sim_dir(seed + count - 1)
run = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0,
stats=test_picker.dump_stats(), will_restart=will_restart, buggify_enabled=buggify_enabled)
# FIXME: support unseed checks for restarting tests
run = TestRun(
binary,
file.absolute(),
seed + count,
self.uid,
restarting=count != 0,
stats=test_picker.dump_stats(),
will_restart=will_restart,
buggify_enabled=buggify_enabled,
)
result = result and run.success
test_picker.add_time(test_files[0], run.run_time, run.summary.out)
decorate_summary(run.summary.out, file, seed + count, run.buggify_enabled)
@ -460,14 +523,22 @@ class TestRunner:
run.summary.out.dump(sys.stdout)
if not result:
return False
if unseed_check and run.summary.unseed is not None:
if count != 0:
self.restore_sim_dir(seed + count - 1)
run2 = TestRun(binary, file.absolute(), seed + count, self.uid, restarting=count != 0,
stats=test_picker.dump_stats(), expected_unseed=run.summary.unseed,
will_restart=will_restart, buggify_enabled=buggify_enabled)
if count == 0 and unseed_check and run.summary.unseed is not None:
run2 = TestRun(
binary,
file.absolute(),
seed + count,
self.uid,
restarting=count != 0,
stats=test_picker.dump_stats(),
expected_unseed=run.summary.unseed,
will_restart=will_restart,
buggify_enabled=buggify_enabled,
)
test_picker.add_time(file, run2.run_time, run.summary.out)
decorate_summary(run2.summary.out, file, seed + count, run.buggify_enabled)
decorate_summary(
run2.summary.out, file, seed + count, run.buggify_enabled
)
run2.summary.out.dump(sys.stdout)
result = result and run2.success
if not result:
@ -475,7 +546,11 @@ class TestRunner:
return result
def run(self) -> bool:
seed = config.random_seed if config.random_seed is not None else config.random.randint(0, 2 ** 32 - 1)
seed = (
config.random_seed
if config.random_seed is not None
else config.random.randint(0, 2**32 - 1)
)
test_files = self.test_picker.choose_test()
success = self.run_tests(test_files, seed, self.test_picker)
if config.clean_up:

View File

@ -5,6 +5,7 @@ import asyncio
import logging
import os
import os.path
import random
import sys
import lib.fdb_process
@ -45,17 +46,23 @@ def _setup_logs(log_level: int = logging.INFO):
logger.handlers.clear()
stdout_handler = logging.StreamHandler(stream=sys.stderr)
stdout_handler = logging.StreamHandler(stream=sys.stdout)
stdout_handler.setLevel(log_level)
stdout_handler.setFormatter(log_format)
logger.addHandler(stdout_handler)
logger.setLevel(log_level)
# Here we might lose some of the logging from lib
# Here we might lose some of the logging from lib as the logger is set after
# importing the modules
lib_logger = logging.getLogger("lib")
lib_logger.addHandler(stdout_handler)
lib_logger.setLevel(log_level)
local_cluster_logger = logging.getLogger("local_cluster")
local_cluster_logger.addHandler(stdout_handler)
local_cluster_logger.setLevel(log_level)
def _setup_args() -> argparse.Namespace:
"""Parse the command line arguments"""
@ -109,6 +116,12 @@ def _setup_args() -> argparse.Namespace:
default=DEFAULT_TIMEOUT_PER_TEST,
help="Timeout for each single test",
)
parser.add_argument(
"--random",
action="store_true",
default=False,
help="Randomly pick up a test",
)
return parser.parse_args()
@ -137,11 +150,16 @@ class TestSet:
self._concurrency = concurrency
self._timeout = timeout
self._logging_level = logging_level
self._cluster_file = None
self._env = dict(os.environ)
self._update_path_from_env("LD_LIBRARY_PATH", ld_library_path)
self._update_path_from_env("PYTHONPATH", DEFAULT_PYTHON_BINDER)
def set_cluster_file(self, cluster_file: str):
"""Sets the cluster file for the test"""
self._cluster_file = cluster_file
def _update_path_from_env(self, environment_variable_name: str, new_path: str):
original_path = os.getenv(environment_variable_name)
self._env[environment_variable_name] = (
@ -159,6 +177,8 @@ class TestSet:
):
arguments = [
api_language,
"--cluster-file",
self._cluster_file,
"--test-name",
test_name,
"--logging-level",
@ -190,6 +210,7 @@ class TestSet:
test_name: str,
additional_args: List[str],
):
assert self._cluster_file is not None, "Must set cluster file before the test"
logger.debug(f"Run test API [{api_language}] Test name [{test_name}]")
try:
await self._test_coroutine(
@ -272,9 +293,7 @@ def _log_cluster_lines_with_severity(
else:
reporter = logger.debug
if len(lines) == 0:
reporter(f"{log_file}: No Severity={severity} lines")
else:
if len(lines) > 0:
reporter(
"{}: {} lines with Severity={}\n{}".format(
log_file, len(lines), severity, "".join(lines)
@ -282,9 +301,7 @@ def _log_cluster_lines_with_severity(
)
async def run_binding_tests(
test_set: TestSet, num_cycles: int, stop_at_failure: int = None
):
def _generate_test_list(test_set: TestSet, api_languages: List[str]):
tests = [
test_set.run_scripted_test,
test_set.run_api_test,
@ -292,39 +309,78 @@ async def run_binding_tests(
test_set.run_directory_test,
test_set.run_directory_hca_test,
]
return [
# bind each (test, api_language) pair at definition time; a bare closure here
# would late-bind and leave every entry pointing at the last pair
lambda test=test, api_language=api_language: test(api_language)
for test in tests
for api_language in API_LANGUAGES
]
async def run_binding_tests(
test_set: TestSet,
num_cycles: int,
stop_at_failure: int = None,
random_pick_single: bool = False,
) -> int:
"""Run the binding tests
:param TestSet test_set: The set of binding tests to run
:param int num_cycles: Number of test cycles to run
:param int stop_at_failure: Stop at i-th failure, defaults to None
:param bool random_pick_single: Randomly pick a single test, defaults to False
:return int: Number of failures
"""
tests = _generate_test_list(test_set=test_set, api_languages=API_LANGUAGES)
num_failures: int = 0
async def run_tests():
nonlocal num_failures
for api_language in API_LANGUAGES:
for test in tests:
test_success = await test(api_language)
if not test_success:
num_failures += 1
if stop_at_failure and num_failures > stop_at_failure:
raise RuntimeError(
f"Maximum number of test failures have reached"
)
for test in tests:
test_success = await test()
if not test_success:
num_failures += 1
if stop_at_failure and num_failures > stop_at_failure:
return
async def run_test_random():
nonlocal num_failures
test = random.choice(tests)
test_success = await test()
if not test_success:
num_failures += 1
async def run_test_cycles() -> int:
for cycle in range(num_cycles):
logger.info(f"Starting cycle {cycle}")
if random_pick_single:
await run_test_random()
else:
await run_tests()
if stop_at_failure and num_failures > stop_at_failure:
logger.error(
f"Reached maximum failures of {num_failures}, prematurely terminating"
)
return num_failures
return num_failures
async with lib.local_cluster.FDBServerLocalCluster(1) as local_cluster:
logger.info("Start binding test")
test_set.set_cluster_file(local_cluster)
try:
for cycle in range(num_cycles):
logger.info(f"Starting cycle {cycle}")
await run_tests()
await run_test_cycles()
except:
logger.exception("Error found during the binding test")
raise
finally:
logger.info(f"Binding test completed with {num_failures} failures")
_log_cluster_lines_with_severity(local_cluster, 40)
_log_cluster_lines_with_severity(local_cluster, 30)
return num_failures
def main():
def main() -> int:
args = _setup_args()
_setup_logs(args.debug)
_setup_logs(logging.DEBUG if args.debug else logging.INFO)
_check_file(args.fdbserver_path, True)
_check_file(args.fdbcli_path, True)
@ -333,12 +389,12 @@ def main():
lib.fdb_process.set_fdbserver_path(args.fdbserver_path)
lib.fdb_process.set_fdbcli_path(args.fdbcli_path)
logger.info(f"Executable: {__file__}")
logger.info(f"PID: {os.getpid()}")
logger.info(f"fdbserver: {args.fdbserver_path}")
logger.info(f"fdbcli: {args.fdbcli_path}")
logger.info(f"libfdb: {args.libfdb_path}")
logger.info(f"NumCycles: {args.num_cycles}")
logger.debug(f"Executable: {__file__}")
logger.debug(f"PID: {os.getpid()}")
logger.debug(f"fdbserver: {args.fdbserver_path}")
logger.debug(f"fdbcli: {args.fdbcli_path}")
logger.debug(f"libfdb: {args.libfdb_path}")
logger.debug(f"NumCycles: {args.num_cycles}")
test_set = TestSet(
binding_tester=args.binding_tester_path,
@ -349,9 +405,13 @@ def main():
timeout=args.test_timeout,
)
asyncio.run(run_binding_tests(test_set, args.num_cycles, args.stop_at_failure))
logger.info(f"Binding test start")
num_failures = asyncio.run(
run_binding_tests(test_set, args.num_cycles, args.stop_at_failure, args.random)
)
logger.info(f"Binding test finished with {num_failures} failures")
return 0
return 0 if num_failures == 0 else 1
if __name__ == "__main__":

View File

@ -67,6 +67,8 @@ class _ExecutablePath:
path = overridden_path
if path is None:
path = shutil.which(self._executable)
else:
path = os.path.abspath(path)
if path is None or not os.path.exists(path):
raise FileNotFoundError(

View File

@ -63,7 +63,8 @@ async def run_fdbservers(num_processes, work_dir, cluster_file, port):
async with lib.local_cluster.FDBServerLocalCluster(
num_processes, work_dir, cluster_file, port
):
await asyncio.sleep(20)
while True:
await asyncio.sleep(1)
def main():

View File

@ -56,7 +56,6 @@ function(add_documentation_target)
${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${target}_done
DEPENDS ${SRCS}
WORKING_DIRECTORY ${venv_dir})
message(STATUS "add_custom_target(${target} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${target}_done)")
add_custom_target(${target} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${target}_done)
add_dependencies(${target} buildsphinx)
endfunction()
@ -85,7 +84,6 @@ else()
string(MD5 username_hash ${username})
# cmake math function can only use 64 bit signed integers - so we just truncate the string
string(SUBSTRING "${username_hash}" 0 15 username_hash_small)
message(STATUS math(EXPR port "(0x${username_hash_small} % 8000) + 8000" OUTPUT_FORMAT DECIMAL))
math(EXPR port "(0x${username_hash_small} % 8000) + 8000" OUTPUT_FORMAT DECIMAL)
message(STATUS "Port is ${port}")
endif()

View File

@ -131,12 +131,35 @@ The default is ``disabled``, which means changing the storage engine will not be
``aggressive`` tries to replace as many storages as it can at once, and will recruit a new storage server on the same process as the old one. This will be faster, but can potentially hit degraded performance or OOM with two storages on the same process. The main benefit over ``gradual`` is that this doesn't need to take one storage out of rotation, so it works for small or development clusters that have the same number of storage processes as the replication factor. Note that ``aggressive`` is not exclusive to running the perpetual wiggle.
``disabled`` means that if the storage engine is changed, fdb will not move the cluster over to the new storage engine. This will disable the perpetual wiggle from rewriting storage files.
consistencyscan
----------------
This command controls a native data consistency scan role that is automatically recruited in the FDB cluster. The consistency scan reads all replicas of each shard to verify data consistency. It is useful for finding corrupt cold data by ensuring that all data is read periodically. Any errors found will be logged as TraceEvents with Severity = 40.
The syntax is
``consistencyscan [ off | on [maxRate <RATE>] [targetInterval <INTERVAL>] [restart <RESTART>] ]``
* ``off`` will disable the consistency scan
* ``on`` will enable the scan and can be accompanied by additional options shown above
* ``RATE`` - sets the maximum read speed of the scan in bytes/s.
* ``INTERVAL`` - sets the target completion time, in seconds, for each full pass over all data in the cluster. Scan speed will target this interval with a hard limit of RATE.
* ``RESTART`` - 1 or 0, controlling whether the scan restarts from the beginning of the user keyspace on startup. This should normally be set to 0, which resumes progress from the last time the scan was running.
The consistency scan role publishes its configuration and metrics in Status JSON under the path ``.cluster.consistency_scan_info``.
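For example (values illustrative), ``consistencyscan on maxRate 50000000 targetInterval 604800 restart 0`` enables the scan, caps reads at 50 MB/s, and targets one complete pass per week while resuming prior progress; ``consistencyscan off`` disables it again.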
consistencycheck
----------------
The ``consistencycheck`` command enables or disables consistency checking. Its syntax is ``consistencycheck [on|off]``. Calling it with ``on`` enables consistency checking, and ``off`` disables it. Calling it with no arguments displays whether consistency checking is currently enabled.
Note: This command exists for backward compatibility; it is suggested to use the ``consistencyscan`` command to control FDB's internal consistency scan role instead.
You must be running an ``fdbserver`` process with the ``consistencycheck`` role to perform consistency checking.
This command controls a database key that governs the behavior of any externally configured consistency check roles. You must be running an ``fdbserver`` process with the ``consistencycheck`` role to perform consistency checking.
The ``consistencycheck`` command enables or disables consistency checking. Its syntax is ``consistencycheck [on|off]``. Calling it with ``on`` enables consistency checking, and ``off`` disables it. Calling it with no arguments displays whether consistency checking is currently enabled.
coordinators
------------

View File

@ -909,6 +909,10 @@
"expired_age" : 0, // The age in seconds of expired_version.
"oldest_id_version" : 0, // The version of the oldest idempotency id still stored in the database.
"oldest_id_age" : 0 // The age in seconds of the oldest_id_version.
},
"version_epoch":{
"enabled": true,
"epoch": 0 // The version epoch, as an offset from the Unix epoch. This field will be excluded if enabled is false.
}
},
"client":{

View File

@ -608,7 +608,7 @@ int main(int argc, char** argv) {
setupNetwork(0, UseMetrics::True);
TraceEvent::setNetworkThread();
openTraceFile(NetworkAddress(), 10 << 20, 10 << 20, param.log_dir, "convert", param.trace_log_group);
openTraceFile({}, 10 << 20, 10 << 20, param.log_dir, "convert", param.trace_log_group);
auto f = stopAfter(convert(param));

View File

@ -641,7 +641,7 @@ int main(int argc, char** argv) {
param.updateKnobs();
TraceEvent::setNetworkThread();
openTraceFile(NetworkAddress(), 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group);
openTraceFile({}, 10 << 20, 500 << 20, param.log_dir, "decode", param.trace_log_group);
param.tlsConfig.setupBlobCredentials();
auto f = stopAfter(decode_logs(param));

View File

@ -2383,7 +2383,7 @@ ACTOR Future<Void> runRestore(Database db,
fmt::print("Restored to version {}\n", restoredVersion);
}
} else {
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(targetVersion, db, ranges));
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(targetVersion, ranges));
if (!rset.present()) {
fmt::print(stderr,
@ -2493,7 +2493,7 @@ ACTOR Future<Void> runFastRestoreTool(Database db,
restoreVersion = dbVersion;
}
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(restoreVersion, db));
state Optional<RestorableFileSet> rset = wait(bc->getRestoreSet(restoreVersion));
if (!rset.present()) {
fmt::print(stderr, "Insufficient data to restore to version {}\n", restoreVersion);
throw restore_invalid_version();
@ -2768,7 +2768,7 @@ ACTOR Future<Void> queryBackup(const char* name,
format("the specified restorable version %lld is not valid", restoreVersion));
return Void();
}
Optional<RestorableFileSet> fileSet = wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter));
Optional<RestorableFileSet> fileSet = wait(bc->getRestoreSet(restoreVersion, keyRangesFilter));
if (fileSet.present()) {
int64_t totalRangeFilesSize = 0, totalLogFilesSize = 0;
result["restore_version"] = fileSet.get().targetVersion;
@ -3973,7 +3973,7 @@ int main(int argc, char* argv[]) {
// a cluster so they should use this instead.
auto initTraceFile = [&]() {
if (trace)
openTraceFile(NetworkAddress(), traceRollSize, traceMaxLogsSize, traceDir, "trace", traceLogGroup);
openTraceFile({}, traceRollSize, traceMaxLogsSize, traceDir, "trace", traceLogGroup);
};
auto initCluster = [&](bool quiet = false) {

View File

@ -47,11 +47,8 @@ ACTOR Future<UID> auditStorageCommandActor(Reference<IClusterConnectionRecord> c
return UID();
}
Key begin, end;
if (tokens.size() == 2) {
begin = allKeys.begin;
end = allKeys.end;
} else if (tokens.size() == 3) {
Key begin = allKeys.begin, end = allKeys.end;
if (tokens.size() == 3) {
begin = tokens[2];
} else if (tokens.size() == 4) {
begin = tokens[2];
@ -66,7 +63,11 @@ ACTOR Future<UID> auditStorageCommandActor(Reference<IClusterConnectionRecord> c
}
CommandFactory auditStorageFactory("audit_storage",
CommandHelp("audit_storage <ha> [BeginKey] [EndKey]",
CommandHelp("audit_storage <Type> [BeginKey EndKey]",
"Start an audit storage",
"Trigger an audit storage, the auditID is returned.\n"));
"Specify audit `Type' (only `ha' `Type' is supported currently), and\n"
"optionally a sub-range with `BeginKey' and `EndKey'.\n"
"For example, to audit the full key range: `audit_storage ha'\n"
"To audit a sub-range only: `audit_storage ha 0xa 0xb'\n"
"Returns an audit `ID'. See also `get_audit_status' command.\n"));
} // namespace fdb_cli

View File

@ -24,6 +24,7 @@
#include "fdbclient/IClientApi.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/BlobGranuleRequest.actor.h"
#include "flow/Arena.h"
#include "flow/FastRef.h"
@ -88,6 +89,22 @@ ACTOR Future<Void> doBlobCheck(Database db, Key startKey, Key endKey, Optional<V
return Void();
}
ACTOR Future<Void> doBlobFlush(Database db, Key startKey, Key endKey, Optional<Version> version, bool compact) {
// TODO make DB function?
state Version flushVersion;
if (version.present()) {
flushVersion = version.get();
} else {
wait(store(flushVersion, getLatestReadVersion(db)));
}
KeyRange range(KeyRangeRef(startKey, endKey));
FlushGranuleRequest req(-1, range, flushVersion, compact);
wait(success(doBlobGranuleRequests(db, range, req, &BlobWorkerInterface::flushGranuleRequest)));
return Void();
}
} // namespace
namespace fdb_cli {
@ -147,7 +164,8 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
tokens[3].printable());
}
return success;
} else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check")) {
} else if (tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge") || tokencmp(tokens[1], "check") ||
tokencmp(tokens[1], "flush") || tokencmp(tokens[1], "compact")) {
bool purge = tokencmp(tokens[1], "purge") || tokencmp(tokens[1], "forcepurge");
bool forcePurge = tokencmp(tokens[1], "forcepurge");
@ -175,7 +193,15 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
if (purge) {
wait(doBlobPurge(localDb, begin, end, version, forcePurge));
} else {
wait(doBlobCheck(localDb, begin, end, version));
if (tokencmp(tokens[1], "check")) {
wait(doBlobCheck(localDb, begin, end, version));
} else if (tokencmp(tokens[1], "flush")) {
wait(doBlobFlush(localDb, begin, end, version, false));
} else if (tokencmp(tokens[1], "compact")) {
wait(doBlobFlush(localDb, begin, end, version, true));
} else {
ASSERT(false);
}
}
} else {
printUsage(tokens[0]);
@ -187,5 +213,5 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
CommandFactory blobRangeFactory(
"blobrange",
CommandHelp("blobrange <start|stop|check|purge|forcepurge> <startkey> <endkey> [version]", "", ""));
CommandHelp("blobrange <start|stop|check|purge|forcepurge|flush|compact> <startkey> <endkey> [version]", "", ""));
} // namespace fdb_cli

View File

@ -58,4 +58,10 @@ if (NOT WIN32 AND NOT OPEN_FOR_IDE)
5
--external-client-library ${CMAKE_BINARY_DIR}/bindings/c/libfdb_c_external.so
)
add_multi_fdbclient_test(
NAME metacluster_fdbcli_tests
COMMAND ${CMAKE_SOURCE_DIR}/fdbcli/tests/metacluster_fdbcli_tests.py
${CMAKE_BINARY_DIR}
)
endif()

View File

@ -21,7 +21,9 @@
#include "fdbcli/FlowLineNoise.h"
#include "flow/IThreadPool.h"
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include "boost/asio.hpp"

View File

@ -31,7 +31,7 @@
namespace fdb_cli {
ACTOR Future<bool> getAuditStatusCommandActor(Database cx, std::vector<StringRef> tokens) {
if (tokens.size() != 4) {
if (tokens.size() < 3 || tokens.size() > 4) {
printUsage(tokens[0]);
return false;
}
@ -45,11 +45,18 @@ ACTOR Future<bool> getAuditStatusCommandActor(Database cx, std::vector<StringRef
}
if (tokencmp(tokens[2], "id")) {
if (tokens.size() != 4) {
printUsage(tokens[0]);
return false;
}
const UID id = UID::fromString(tokens[3].toString());
AuditStorageState res = wait(getAuditState(cx, type, id));
printf("Audit result is:\n%s", res.toString().c_str());
} else if (tokencmp(tokens[2], "recent")) {
const int count = std::stoi(tokens[3].toString());
int count = CLIENT_KNOBS->TOO_MANY;
if (tokens.size() == 4) {
count = std::stoi(tokens[3].toString());
}
std::vector<AuditStorageState> res = wait(getLatestAuditStates(cx, type, count));
for (const auto& it : res) {
printf("Audit result is:\n%s\n", it.toString().c_str());
@ -60,8 +67,15 @@ ACTOR Future<bool> getAuditStatusCommandActor(Database cx, std::vector<StringRef
CommandFactory getAuditStatusFactory(
"get_audit_status",
CommandHelp("get_audit_status <ha> <id|recent> [ARGs]",
"Retrieve audit storage results of the specific type",
"Fetch audit result with an ID: get_audit_status [Type] id [ID];\n"
"Fetch most recent audit results: get_audit_status [Type] recent [Count].\n"));
CommandHelp("get_audit_status <Type> <id|recent> [ARGs]",
"Retrieve audit storage status",
"To fetch audit status via ID: `get_audit_status [Type] id [ID]'\n"
"To fetch status of most recent audit: `get_audit_status [Type] recent [Count]'\n"
"Only 'ha' `Type' is supported currently. If specified, `Count' is how many\n"
"rows to audit. If not specified, check all rows in audit.\n"
"Results have the following format:\n"
" `[ID]: 000000000001000000000000, [Range]: - 0xff, [Type]: 1, [Phase]: 2'\n"
"where `Type' is `1' for `ha' and `Phase' is `2' for `Complete'.\n"
"Phase can be `Invalid=0', `Running=1', `Complete=2', `Error=3', or `Failed=4'.\n"
"See also `audit_storage' command."));
} // namespace fdb_cli

View File

@ -252,8 +252,8 @@ ACTOR Future<bool> metaclusterGetCommand(Reference<IDatabase> db, std::vector<St
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "success";
obj["cluster"] = metadata.toJson();
obj[msgTypeKey] = "success";
obj[msgClusterKey] = metadata.toJson();
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print(" connection string: {}\n", metadata.connectionString.toString().c_str());
@ -264,8 +264,8 @@ ACTOR Future<bool> metaclusterGetCommand(Reference<IDatabase> db, std::vector<St
} catch (Error& e) {
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "error";
obj["error"] = e.what();
obj[msgTypeKey] = "error";
obj[msgErrorKey] = e.what();
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
return false;
} else {
@ -287,39 +287,85 @@ ACTOR Future<bool> metaclusterStatusCommand(Reference<IDatabase> db, std::vector
state bool useJson = tokens.size() == 3;
try {
std::map<ClusterName, DataClusterMetadata> clusters =
wait(MetaclusterAPI::listClusters(db, ""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS));
state Optional<std::string> metaclusterName;
auto capacityNumbers = MetaclusterAPI::metaclusterCapacity(clusters);
state Reference<ITransaction> tr = db->createTransaction();
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "success";
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
Optional<MetaclusterRegistrationEntry> registrationEntry =
wait(MetaclusterMetadata::metaclusterRegistration().get(tr));
const ClusterType clusterType =
!registrationEntry.present() ? ClusterType::STANDALONE : registrationEntry.get().clusterType;
if (ClusterType::STANDALONE == clusterType) {
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "success";
obj[msgClusterTypeKey] = clusterTypeToString(clusterType);
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print("This cluster is not part of a metacluster\n");
}
return true;
} else if (ClusterType::METACLUSTER_DATA == clusterType) {
ASSERT(registrationEntry.present());
metaclusterName = registrationEntry.get().metaclusterName.toString();
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "success";
obj[msgClusterTypeKey] = clusterTypeToString(clusterType);
json_spirit::mObject metaclusterObj;
metaclusterObj[msgMetaclusterName] = metaclusterName.get();
obj[msgMetaclusterKey] = metaclusterObj;
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print("This cluster \"{}\" is a data cluster within the metacluster named \"{}\"\n",
registrationEntry.get().name.toString().c_str(),
metaclusterName.get().c_str());
}
return true;
}
json_spirit::mObject metaclusterObj;
metaclusterObj["data_clusters"] = (int)clusters.size();
metaclusterObj["capacity"] = capacityNumbers.first.toJson();
metaclusterObj["allocated"] = capacityNumbers.second.toJson();
metaclusterName = registrationEntry.get().metaclusterName.toString();
obj["metacluster"] = metaclusterObj;
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print(" number of data clusters: {}\n", clusters.size());
fmt::print(" tenant group capacity: {}\n", capacityNumbers.first.numTenantGroups);
fmt::print(" allocated tenant groups: {}\n", capacityNumbers.second.numTenantGroups);
}
ASSERT(ClusterType::METACLUSTER_MANAGEMENT == clusterType);
std::map<ClusterName, DataClusterMetadata> clusters =
wait(MetaclusterAPI::listClustersTransaction(tr, ""_sr, "\xff"_sr, CLIENT_KNOBS->MAX_DATA_CLUSTERS));
auto capacityNumbers = MetaclusterAPI::metaclusterCapacity(clusters);
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "success";
obj[msgClusterTypeKey] = clusterTypeToString(ClusterType::METACLUSTER_MANAGEMENT);
return true;
} catch (Error& e) {
if (useJson) {
json_spirit::mObject obj;
obj["type"] = "error";
obj["error"] = e.what();
fmt::print("{}\n", json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
return false;
} else {
throw;
json_spirit::mObject metaclusterObj;
metaclusterObj[msgMetaclusterName] = metaclusterName.get();
metaclusterObj[msgDataClustersKey] = static_cast<int>(clusters.size());
metaclusterObj[msgCapacityKey] = capacityNumbers.first.toJson();
metaclusterObj[msgAllocatedKey] = capacityNumbers.second.toJson();
obj[msgMetaclusterKey] = metaclusterObj;
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
} else {
fmt::print(" number of data clusters: {}\n", clusters.size());
fmt::print(" tenant group capacity: {}\n", capacityNumbers.first.numTenantGroups);
fmt::print(" allocated tenant groups: {}\n", capacityNumbers.second.numTenantGroups);
}
return true;
} catch (Error& e) {
if (useJson) {
json_spirit::mObject obj;
obj[msgTypeKey] = "error";
obj[msgErrorKey] = e.what();
fmt::print("{}\n",
json_spirit::write_string(json_spirit::mValue(obj), json_spirit::pretty_print).c_str());
return false;
} else {
throw;
}
}
}
}

View File

@ -365,10 +365,18 @@ ACTOR Future<bool> tenantListCommand(Reference<IDatabase> db, std::vector<String
state ClusterType clusterType = wait(TenantAPI::getClusterType(tr));
state std::vector<TenantName> tenantNames;
if (clusterType == ClusterType::METACLUSTER_MANAGEMENT) {
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(MetaclusterAPI::listTenants(db, beginTenant, endTenant, limit, offset, filters));
for (auto tenant : tenants) {
tenantNames.push_back(tenant.first);
if (filters.empty()) {
std::vector<std::pair<TenantName, int64_t>> tenants =
wait(MetaclusterAPI::listTenants(db, beginTenant, endTenant, limit, offset));
for (auto tenant : tenants) {
tenantNames.push_back(tenant.first);
}
} else {
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(MetaclusterAPI::listTenantMetadata(db, beginTenant, endTenant, limit, offset, filters));
for (auto tenant : tenants) {
tenantNames.push_back(tenant.first);
}
}
} else {
// Hold the reference to the standalone's memory

View File

@ -39,6 +39,16 @@
namespace fdb_cli {
constexpr char msgTypeKey[] = "type";
constexpr char msgClusterKey[] = "cluster";
constexpr char msgClusterTypeKey[] = "cluster_type";
constexpr char msgMetaclusterName[] = "metacluster_name";
constexpr char msgMetaclusterKey[] = "metacluster";
constexpr char msgDataClustersKey[] = "data_clusters";
constexpr char msgCapacityKey[] = "capacity";
constexpr char msgAllocatedKey[] = "allocated";
constexpr char msgErrorKey[] = "error";
struct CommandHelp {
std::string usage;
std::string short_desc;

View File

@ -1077,7 +1077,7 @@ if __name__ == '__main__':
description="""
The test calls fdbcli commands through fdbcli --exec "<command>" interactively using subprocess.
The outputs from fdbcli are returned and compared to predefined results.
Consequently, changing fdbcli outputs or breaking any commands will casue the test to fail.
Consequently, changing fdbcli outputs or breaking any commands will cause the test to fail.
Commands that are easy to test will run against a single process cluster.
For complex commands like exclude, they will run against a cluster with multiple (currently set to 5) processes.
If external_client_library is given, we will disable the local client and use the external client to run fdbcli.

View File

@ -0,0 +1,95 @@
#!/usr/bin/env python3
import argparse
import os
import subprocess
from argparse import RawDescriptionHelpFormatter
def run_command(*args):
commands = ["{}".format(args)]
print(commands)
try:
process = subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env, timeout=20)
return process.stdout.decode('utf-8').strip()
except subprocess.TimeoutExpired:
raise Exception('the command is stuck')
def run_fdbcli_command(cluster_file, *args):
command_template = [fdbcli_bin, '-C', "{}".format(cluster_file), '--exec']
commands = command_template + ["{}".format(' '.join(args))]
print(commands)
try:
# if the fdbcli command is stuck for more than 20 seconds, the database is definitely unavailable
process = subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env, timeout=20)
return process.stdout.decode('utf-8').strip()
except subprocess.TimeoutExpired:
raise Exception('The fdbcli command is stuck, database is unavailable')
def get_cluster_connection_str(cluster_file_path):
with open(cluster_file_path, 'r') as f:
conn_str = f.readline().strip()
return conn_str
def metacluster_create(cluster_file, name):
return run_fdbcli_command(cluster_file, "metacluster create_experimental", name)
def metacluster_register(management_cluster_file, data_cluster_file, name):
conn_str = get_cluster_connection_str(data_cluster_file)
return run_fdbcli_command(management_cluster_file, "metacluster register", name, "connection_string={}".format(
conn_str))
def metacluster_status(cluster_file):
return run_fdbcli_command(cluster_file, "metacluster status")
if __name__ == "__main__":
print("metacluster_fdbcli_tests")
script_desc = """
This script executes a series of commands on multiple clusters within an FDB metacluster.
"""
parser = argparse.ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
description=script_desc)
parser.add_argument('build_dir', metavar='BUILD_DIRECTORY', help='FDB build directory')
args = parser.parse_args()
# keep current environment variables
fdbcli_env = os.environ.copy()
cluster_files = fdbcli_env.get("FDB_CLUSTERS").split(';')
num_clusters = len(cluster_files)
assert len(cluster_files) > 1
fdbcli_bin = args.build_dir + '/bin/fdbcli'
for cf in cluster_files:
output = metacluster_status(cf)
assert output == "This cluster is not part of a metacluster"
names = ['meta_mgmt']
names.extend(['data{}'.format(i) for i in range(1, num_clusters)])
metacluster_create(cluster_files[0], names[0])
for (cf, name) in zip(cluster_files[1:], names[1:]):
output = metacluster_register(cluster_files[0], cf, name)
expected = """
number of data clusters: {}
tenant group capacity: 0
allocated tenant groups: 0
"""
expected = expected.format(num_clusters - 1).strip()
output = metacluster_status(cluster_files[0])
assert expected == output
for (cf, name) in zip(cluster_files[1:], names[1:]):
output = metacluster_status(cf)
expected = "This cluster \"{}\" is a data cluster within the metacluster named \"{" \
"}\"".format(name, names[0])
assert expected == output
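Outside of ctest, the script expects the semicolon-separated cluster files in the FDB_CLUSTERS environment variable and the build directory as its only positional argument; a sketch of an invocation with illustrative paths:
FDB_CLUSTERS="/tmp/mgmt.cluster;/tmp/data1.cluster" ./metacluster_fdbcli_tests.py /path/to/build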

View File

@ -21,7 +21,9 @@
#include "flow/Platform.h"
#include <algorithm>
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include "boost/asio.hpp"

View File

@ -24,6 +24,7 @@
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BlobCipher.h"
#include "fdbclient/CommitTransaction.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/GetEncryptCipherKeys.actor.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/ManagementAPI.actor.h"
@ -251,6 +252,34 @@ Version getLogKeyVersion(Key key) {
return bigEndian64(*(int64_t*)(key.begin() + backupLogPrefixBytes + sizeof(UID) + sizeof(uint8_t)));
}
bool validTenantAccess(std::map<int64_t, TenantName>* tenantMap,
MutationRef m,
bool provisionalProxy,
Version version) {
if (isSystemKey(m.param1)) {
return true;
}
int64_t tenantId = TenantInfo::INVALID_TENANT;
if (m.isEncrypted()) {
tenantId = m.encryptionHeader()->cipherTextDetails.encryptDomainId;
} else {
tenantId = TenantAPI::extractTenantIdFromMutation(m);
}
ASSERT(tenantMap != nullptr);
if (m.isEncrypted() && isReservedEncryptDomain(tenantId)) {
// These are valid encrypt domains so don't check the tenant map
} else if (tenantMap->find(tenantId) == tenantMap->end()) {
// If a tenant is not found for a given mutation then exclude it from the batch
ASSERT(!provisionalProxy);
TraceEvent(SevWarnAlways, "MutationLogRestoreTenantNotFound")
.detail("Version", version)
.detail("TenantId", tenantId);
CODE_PROBE(true, "mutation log restore tenant not found");
return false;
}
return true;
}
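A rough sketch (Python, illustrative only, not the C++ implementation above) of the filtering decision that validTenantAccess performs on each restored mutation:
# illustrative sketch of the tenant-access check during mutation log restore
def valid_tenant_access(tenant_map, tenant_id, is_system_key, is_reserved_encrypt_domain):
    if is_system_key:
        return True
    if is_reserved_encrypt_domain:
        # reserved encrypt domains are always valid, so skip the tenant map
        return True
    # mutations whose tenant is missing from the map are excluded from the batch
    return tenant_id in tenant_map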
// Given a key from one of the ranges returned by get_log_ranges,
// returns(version, part) where version is the database version number of
// the transaction log data in the value, and part is 0 for the first such
@ -319,27 +348,49 @@ ACTOR static Future<Void> decodeBackupLogValue(Arena* arena,
offset += len2;
state Optional<MutationRef> encryptedLogValue = Optional<MutationRef>();
// Check for valid tenant in required tenant mode. If the tenant does not exist in our tenant map then
// we EXCLUDE the mutation (of that respective tenant) during the restore. NOTE: This simply allows a
// restore to make progress in the event of tenant deletion, but tenant deletion should be considered
// carefully so that we do not run into this case. We do this check here so if encrypted mutations are not
// found in the tenant map then we exit early without needing to reach out to the EKP.
if (config.tenantMode == TenantMode::REQUIRED &&
config.encryptionAtRestMode.mode != EncryptionAtRestMode::CLUSTER_AWARE &&
!validTenantAccess(tenantMap, logValue, provisionalProxy, version)) {
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
}
// Decrypt mutation ref if encrypted
if (logValue.isEncrypted()) {
encryptedLogValue = logValue;
state EncryptCipherDomainId domainId = logValue.encryptionHeader()->cipherTextDetails.encryptDomainId;
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(dbInfo, *logValue.encryptionHeader(), BlobCipherMetrics::BACKUP));
logValue = logValue.decrypt(cipherKeys, tempArena, BlobCipherMetrics::BACKUP);
try {
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(dbInfo, *logValue.encryptionHeader(), BlobCipherMetrics::RESTORE));
logValue = logValue.decrypt(cipherKeys, tempArena, BlobCipherMetrics::BACKUP);
} catch (Error& e) {
// It's possible a tenant was deleted and the encrypt key fetch failed
TraceEvent(SevWarnAlways, "MutationLogRestoreEncryptKeyFetchFailed")
.detail("Version", version)
.detail("TenantId", domainId);
if (e.code() == error_code_encrypt_keys_fetch_failed) {
CODE_PROBE(true, "mutation log restore encrypt keys not found");
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
} else {
throw;
}
}
}
ASSERT(!logValue.isEncrypted());
if (config.tenantMode == TenantMode::REQUIRED && !isSystemKey(logValue.param1)) {
// If a tenant is not found for a given mutation then exclude it from the batch
int64_t tenantId = TenantAPI::extractTenantIdFromMutation(logValue);
ASSERT(tenantMap != nullptr);
if (tenantMap->find(tenantId) == tenantMap->end()) {
ASSERT(!provisionalProxy);
TraceEvent("TenantNotFound").detail("Version", version).detail("TenantId", tenantId);
CODE_PROBE(true, "mutation log restore tenant not found");
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
}
// If the mutation was encrypted using cluster aware encryption then check after decryption
if (config.tenantMode == TenantMode::REQUIRED &&
config.encryptionAtRestMode.mode == EncryptionAtRestMode::CLUSTER_AWARE &&
!validTenantAccess(tenantMap, logValue, provisionalProxy, version)) {
consumed += BackupAgentBase::logHeaderSize + len1 + len2;
continue;
}
MutationRef originalLogValue = logValue;

View File

@ -906,7 +906,6 @@ public:
ACTOR static Future<Optional<RestorableFileSet>> getRestoreSet(Reference<BackupContainerFileSystem> bc,
Version targetVersion,
VectorRef<KeyRangeRef> keyRangesFilter,
Optional<Database> cx,
bool logsOnly = false,
Version beginVersion = invalidVersion) {
for (const auto& range : keyRangesFilter) {
@ -974,19 +973,6 @@ public:
continue;
restorable.snapshot = snapshots[i];
// TODO: Reenable the sanity check after TooManyFiles error is resolved
if (false && g_network->isSimulated()) {
// Sanity check key ranges
state std::map<std::string, KeyRange>::iterator rit;
for (rit = restorable.keyRanges.begin(); rit != restorable.keyRanges.end(); rit++) {
auto it = std::find_if(restorable.ranges.begin(),
restorable.ranges.end(),
[file = rit->first](const RangeFile f) { return f.fileName == file; });
ASSERT(it != restorable.ranges.end());
KeyRange result = wait(bc->getSnapshotFileKeyRange(*it, cx));
ASSERT(rit->second.begin <= result.begin && rit->second.end >= result.end);
}
}
// No logs needed if there is a complete filtered key space snapshot at the target version.
if (minKeyRangeVersion == maxKeyRangeVersion && maxKeyRangeVersion == restorable.targetVersion) {
@ -1362,7 +1348,7 @@ Future<Void> BackupContainerFileSystem::expireData(Version expireEndVersion,
ACTOR static Future<KeyRange> getSnapshotFileKeyRange_impl(Reference<BackupContainerFileSystem> bc,
RangeFile file,
Optional<Database> cx) {
Database cx) {
state int readFileRetries = 0;
state bool beginKeySet = false;
state Key beginKey;
@ -1448,18 +1434,17 @@ ACTOR static Future<Optional<Version>> readVersionProperty(Reference<BackupConta
}
}
Future<KeyRange> BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file, Optional<Database> cx) {
Future<KeyRange> BackupContainerFileSystem::getSnapshotFileKeyRange(const RangeFile& file, Database cx) {
ASSERT(g_network->isSimulated());
return getSnapshotFileKeyRange_impl(Reference<BackupContainerFileSystem>::addRef(this), file, cx);
}
Future<Optional<RestorableFileSet>> BackupContainerFileSystem::getRestoreSet(Version targetVersion,
Optional<Database> cx,
VectorRef<KeyRangeRef> keyRangesFilter,
bool logsOnly,
Version beginVersion) {
return BackupContainerFileSystemImpl::getRestoreSet(
Reference<BackupContainerFileSystem>::addRef(this), targetVersion, keyRangesFilter, cx, logsOnly, beginVersion);
Reference<BackupContainerFileSystem>::addRef(this), targetVersion, keyRangesFilter, logsOnly, beginVersion);
}
Future<Optional<Version>> BackupContainerFileSystem::VersionProperty::get() {
@ -1687,8 +1672,7 @@ ACTOR static Future<Void> testWriteSnapshotFile(Reference<IBackupFile> file, Key
ACTOR Future<Void> testBackupContainer(std::string url,
Optional<std::string> proxy,
Optional<std::string> encryptionKeyFileName,
Optional<Database> cx) {
Optional<std::string> encryptionKeyFileName) {
state FlowLock lock(100e6);
if (encryptionKeyFileName.present()) {
@ -1795,13 +1779,13 @@ ACTOR Future<Void> testBackupContainer(std::string url,
for (; i < listing.snapshots.size(); ++i) {
{
// Ensure we can still restore to the latest version
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get(), cx));
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(desc.maxRestorableVersion.get()));
ASSERT(rest.present());
}
{
// Ensure we can restore to the end version of snapshot i
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion, cx));
Optional<RestorableFileSet> rest = wait(c->getRestoreSet(listing.snapshots[i].endVersion));
ASSERT(rest.present());
}
@ -1842,16 +1826,14 @@ ACTOR Future<Void> testBackupContainer(std::string url,
}
TEST_CASE("/backup/containers/localdir/unencrypted") {
wait(testBackupContainer(
format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {}, {}));
wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()), {}, {}));
return Void();
}
TEST_CASE("/backup/containers/localdir/encrypted") {
wait(testBackupContainer(format("file://%s/fdb_backups/%llx", params.getDataDir().c_str(), timer_int()),
{},
format("%s/test_encryption_key", params.getDataDir().c_str()),
{}));
format("%s/test_encryption_key", params.getDataDir().c_str())));
return Void();
}
@ -1859,7 +1841,7 @@ TEST_CASE("/backup/containers/url") {
if (!g_network->isSimulated()) {
const char* url = getenv("FDB_TEST_BACKUP_URL");
ASSERT(url != nullptr);
wait(testBackupContainer(url, {}, {}, {}));
wait(testBackupContainer(url, {}, {}));
}
return Void();
}

View File

@ -85,6 +85,7 @@ BlobCipherMetrics::BlobCipherMetrics()
CounterSet(cc, "KVRedwood"),
CounterSet(cc, "BlobGranule"),
CounterSet(cc, "Backup"),
CounterSet(cc, "Restore"),
CounterSet(cc, "Test") }) {
specialCounter(cc, "CacheSize", []() { return BlobCipherKeyCache::getInstance()->getSize(); });
traceFuture = cc.traceCounters("BlobCipherMetrics", UID(), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL);
@ -102,6 +103,8 @@ std::string toString(BlobCipherMetrics::UsageType type) {
return "BlobGranule";
case BlobCipherMetrics::UsageType::BACKUP:
return "Backup";
case BlobCipherMetrics::UsageType::RESTORE:
return "Restore";
case BlobCipherMetrics::UsageType::TEST:
return "Test";
default:

View File

@ -1479,7 +1479,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[],
const std::vector<StringRef>& deltaFileData,
GranuleMaterializeStats& stats) {
// TODO REMOVE with early replying
ASSERT(readVersion == chunk.includedVersion);
@ -1528,6 +1528,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
if (BG_READ_DEBUG) {
fmt::print("Applying {} delta files\n", chunk.deltaFiles.size());
}
ASSERT(chunk.deltaFiles.size() == deltaFileData.size());
for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) {
stats.inputBytes += deltaFileData[deltaIdx].size();
bool startClear = false;
@ -1656,8 +1657,8 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
}
// +1 to avoid UBSAN variable length array of size zero
StringRef deltaData[files[chunkIdx].deltaFiles.size() + 1];
std::vector<StringRef> deltaData;
deltaData.resize(files[chunkIdx].deltaFiles.size());
for (int i = 0; i < files[chunkIdx].deltaFiles.size(); i++) {
deltaData[i] =
StringRef(granuleContext.get_load_f(loadIds[chunkIdx].deltaIds[i], granuleContext.userContext),
@ -1684,6 +1685,85 @@ ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<B
}
}
// just for client passthrough. reads all key-value pairs from a snapshot file, and all mutations from a delta file
RangeResult bgReadSnapshotFile(const StringRef& data) {
Standalone<StringRef> fname = "f"_sr;
Standalone<VectorRef<ParsedDeltaBoundaryRef>> results = loadSnapshotFile(fname, data, normalKeys, {});
RangeResult snapshot;
snapshot.reserve(snapshot.arena(), results.size());
snapshot.arena().dependsOn(results.arena());
for (auto& it : results) {
snapshot.emplace_back(snapshot.arena(), it.key, it.value);
}
return snapshot;
}
// FIXME: refactor if possible, just copy-pasted from loadChunkedDeltaFile for prototyping
Standalone<VectorRef<GranuleMutationRef>> bgReadDeltaFile(const StringRef& deltaData) {
Standalone<VectorRef<GranuleMutationRef>> deltas;
Standalone<IndexedBlobGranuleFile> file = IndexedBlobGranuleFile::fromFileBytes(deltaData, {});
ASSERT(file.fileType == DELTA_FILE_TYPE);
ASSERT(file.chunkStartOffset > 0);
// empty delta file
if (file.indexBlockRef.block.children.empty()) {
return deltas;
}
ASSERT(file.indexBlockRef.block.children.size() >= 2);
// find range of blocks needed to read
ChildBlockPointerRef* currentBlock = file.indexBlockRef.block.children.begin();
bool lastBlock = false;
bool prevClearAfter = false;
KeyRef prevClearAfterKey;
Version prevClearAfterVersion;
while (!lastBlock) {
auto nextBlock = currentBlock;
nextBlock++;
lastBlock = (nextBlock == file.indexBlockRef.block.children.end() - 1);
Standalone<GranuleSortedDeltas> deltaBlock =
file.getChild<GranuleSortedDeltas>(currentBlock, {}, file.chunkStartOffset);
ASSERT(!deltaBlock.boundaries.empty());
ASSERT(currentBlock->key == deltaBlock.boundaries.front().key);
for (auto& entry : deltaBlock.boundaries) {
if (prevClearAfter) {
deltas.emplace_back(
deltas.arena(), MutationRef::Type::ClearRange, prevClearAfterVersion, prevClearAfterKey, entry.key);
}
prevClearAfter = entry.clearVersion.present();
if (prevClearAfter) {
prevClearAfterVersion = entry.clearVersion.get();
prevClearAfterKey = entry.key;
}
for (auto& v : entry.values) {
if (v.op == MutationRef::Type::ClearRange) {
if (entry.clearVersion.present() && v.version == entry.clearVersion.get()) {
// we'll handle that in the next loop with prevClearAfter
continue;
}
deltas.emplace_back(deltas.arena(),
MutationRef::Type::ClearRange,
v.version,
entry.key,
keyAfter(entry.key, deltas.arena()));
} else {
ASSERT(v.op == MutationRef::Type::SetValue);
deltas.emplace_back(deltas.arena(), MutationRef::Type::SetValue, v.version, entry.key, v.value);
}
}
}
deltas.arena().dependsOn(deltaBlock.arena());
currentBlock++;
}
return deltas;
}
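// Illustrative only, not part of this change: a minimal sketch of how a client-side caller might
// use the two passthrough parsers above, assuming it already holds the raw, unencrypted bytes of
// one snapshot file and one delta file. The function name and the fmt-based printing are
// hypothetical.
void debugPrintGranuleFiles(StringRef snapshotBytes, StringRef deltaBytes) {
	RangeResult rows = bgReadSnapshotFile(snapshotBytes);
	for (auto& kv : rows) {
		fmt::print("snapshot: {} => {}\n", kv.key.printable(), kv.value.printable());
	}
	Standalone<VectorRef<GranuleMutationRef>> mutations = bgReadDeltaFile(deltaBytes);
	for (auto& m : mutations) {
		// param1/param2 are key and value for sets, and range begin/end for clears
		fmt::print("delta @{}: type={} {} {}\n", m.version, (int)m.type, m.param1.printable(), m.param2.printable());
	}
}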
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix) {
// Start with random bytes to avoid metadata hotspotting
// Worker ID for uniqueness and attribution
@ -2359,7 +2439,9 @@ void checkDeltaRead(const KeyValueGen& kvGen,
Version beginVersion,
Version readVersion,
const Standalone<GranuleDeltas>& data,
StringRef* serialized) {
const std::vector<StringRef>& serialized) {
ASSERT_EQ(serialized.size(), 1);
// expected answer
std::map<KeyRef, ValueRef> expectedData;
Version lastFileEndVersion = 0;
@ -2378,7 +2460,7 @@ void checkDeltaRead(const KeyValueGen& kvGen,
deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta");
Standalone<BlobGranuleChunkRef> chunk;
chunk.deltaFiles.emplace_back_deep(
chunk.arena(), filename, 0, serialized->size(), serialized->size(), kvGen.cipherKeys);
chunk.arena(), filename, 0, serialized[0].size(), serialized[0].size(), kvGen.cipherKeys);
chunk.keyRange = kvGen.allRange;
chunk.includedVersion = readVersion;
chunk.snapshotVersion = invalidVersion;
@ -2459,13 +2541,14 @@ TEST_CASE("/blobgranule/files/deltaFormatUnitTest") {
}*/
Value serialized = serializeChunkedDeltaFile(
fileNameRef, data, kvGen.allRange, targetChunkSize, kvGen.compressFilter, kvGen.cipherKeys);
std::vector<StringRef> deltaPtr{ serialized };
// check whole file
checkDeltaRead(kvGen, kvGen.allRange, 0, data.back().version, data, &serialized);
checkDeltaRead(kvGen, kvGen.allRange, 0, data.back().version, data, deltaPtr);
for (int i = 0; i < std::min((size_t)100, kvGen.usedKeysList.size() * data.size()); i++) {
auto params = randomizeKeyAndVersions(kvGen, data);
checkDeltaRead(kvGen, std::get<0>(params), std::get<1>(params), std::get<2>(params), data, &serialized);
checkDeltaRead(kvGen, std::get<0>(params), std::get<1>(params), std::get<2>(params), data, deltaPtr);
}
return Void();
@ -2518,10 +2601,6 @@ void checkGranuleRead(const KeyValueGen& kvGen,
}
deltaIdx++;
}
StringRef deltaPtrs[deltaPtrsVector.size() + 1];
for (int i = 0; i < deltaPtrsVector.size(); i++) {
deltaPtrs[i] = deltaPtrsVector[i];
}
// add in memory deltas
chunk.arena().dependsOn(inMemoryDeltas.arena());
@ -2540,7 +2619,7 @@ void checkGranuleRead(const KeyValueGen& kvGen,
snapshotPtr = serializedSnapshot;
}
RangeResult actualData =
materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs, stats);
materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrsVector, stats);
if (expectedData.size() != actualData.size()) {
fmt::print("Expected Size {0} != Actual Size {1}\n", expectedData.size(), actualData.size());
@ -2684,6 +2763,87 @@ TEST_CASE("/blobgranule/files/granuleReadUnitTest") {
return Void();
}
namespace {
MutationsAndVersionRef singleMutation(Version v,
MutationRef::Type type,
Arena& ar,
const StringRef& param1,
const StringRef& param2) {
MutationsAndVersionRef ref(v, v);
ref.mutations.emplace_back(ar, type, param1, param2);
return ref;
}
void checkMutations(const Standalone<VectorRef<GranuleMutationRef>>& expected,
const Standalone<VectorRef<GranuleMutationRef>>& actual) {
ASSERT(expected.size() == actual.size());
for (int i = 0; i < expected.size(); i++) {
ASSERT(expected[i].version == actual[i].version);
ASSERT(expected[i].type == actual[i].type);
ASSERT(expected[i].param1 == actual[i].param1);
ASSERT(expected[i].param2 == actual[i].param2);
}
}
} // namespace
/*
Input mutations:
Set A=5 @ 100
Clear [A - C) @ 200
Set E=6 @ 300
Set A=7 @ 400
Clear [A - E) @ 500
Clear [E - E\x00) @ 600 (single key clear)
Output mutations:
Set A=5 @ 100
Set A=7 @ 400
Clear [A - A\x00) @ 500
Clear [A - C) @ 200
Clear [C - E) @ 500
Set E=6 @ 300
Clear [E - E\x00) @ 600
*/
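// Note on the expected ordering above: the delta file groups mutations by boundary key, and a
// clear that extends past a boundary is carried as that boundary's clearVersion. bgReadDeltaFile
// therefore first re-emits the pending cross-boundary clear (prevClearAfter) up to the current
// key, then that key's own sets and single-key clears, so the reconstructed output is ordered by
// key rather than by version.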
TEST_CASE("/blobgranule/files/bgReadDeltaFile") {
Arena ar;
Standalone<StringRef> strA = "A"_sr;
Standalone<StringRef> strC = "C"_sr;
Standalone<StringRef> strE = "E"_sr;
Standalone<StringRef> str5 = "5"_sr;
Standalone<StringRef> str6 = "6"_sr;
Standalone<StringRef> str7 = "7"_sr;
Standalone<StringRef> strAfterA = keyAfter(strA);
Standalone<StringRef> strAfterE = keyAfter(strE);
Standalone<GranuleDeltas> originalMutations;
originalMutations.push_back(ar, singleMutation(100, MutationRef::Type::SetValue, ar, strA, str5));
originalMutations.push_back(ar, singleMutation(200, MutationRef::Type::ClearRange, ar, strA, strC));
originalMutations.push_back(ar, singleMutation(300, MutationRef::Type::SetValue, ar, strE, str6));
originalMutations.push_back(ar, singleMutation(400, MutationRef::Type::SetValue, ar, strA, str7));
originalMutations.push_back(ar, singleMutation(500, MutationRef::Type::ClearRange, ar, strA, strE));
originalMutations.push_back(ar, singleMutation(600, MutationRef::Type::ClearRange, ar, strE, strAfterE));
Standalone<VectorRef<GranuleMutationRef>> expectedMutations;
expectedMutations.emplace_back(ar, MutationRef::Type::SetValue, 100, strA, str5);
expectedMutations.emplace_back(ar, MutationRef::Type::SetValue, 400, strA, str7);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 500, strA, strAfterA);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 200, strA, strC);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 500, strC, strE);
expectedMutations.emplace_back(ar, MutationRef::Type::SetValue, 300, strE, str6);
expectedMutations.emplace_back(ar, MutationRef::Type::ClearRange, 600, strE, strAfterE);
for (int chunkSize = 1; chunkSize <= 32 * 1024; chunkSize *= 2) {
Value serialized =
serializeChunkedDeltaFile(strA, originalMutations, KeyRangeRef(strA, strAfterE), chunkSize, {}, {});
Standalone<VectorRef<GranuleMutationRef>> actualMutations = bgReadDeltaFile(serialized);
checkMutations(expectedMutations, actualMutations);
}
return Void();
}
// performance micro-benchmarks
struct FileSet {
@ -2932,7 +3092,7 @@ std::pair<int64_t, double> doDeltaWriteBench(const Standalone<GranuleDeltas>& da
void chunkFromFileSet(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef>& chunk,
StringRef* deltaPtrs,
std::vector<StringRef>& deltaPtrs,
Version readVersion,
Optional<BlobGranuleCipherKeysCtx> keys,
int numDeltaFiles) {
@ -2985,7 +3145,7 @@ std::pair<int64_t, double> doReadBench(const FileSet& fileSet,
Standalone<BlobGranuleChunkRef> chunk;
GranuleMaterializeStats stats;
ASSERT(numDeltaFiles >= 0 && numDeltaFiles <= fileSet.deltaFiles.size());
StringRef deltaPtrs[numDeltaFiles];
std::vector<StringRef> deltaPtrs(numDeltaFiles);
MutationRef clearAllAtEndMutation;
if (clearAllAtEnd) {

View File

@ -36,21 +36,12 @@ ACTOR Future<Standalone<StringRef>> readFile(Reference<BlobConnectionProvider> b
state Arena arena;
std::string fname = f.filename.toString();
state Reference<BackupContainerFileSystem> bstore = bstoreProvider->getForRead(fname);
// printf("Starting read of snapshot file %s\n", fname.c_str());
state Reference<IAsyncFile> reader = wait(bstore->readFile(fname));
// printf("Got snapshot file size %lld\n", size);
state uint8_t* data = new (arena) uint8_t[f.length];
// printf("Reading %lld bytes from snapshot file %s\n", size, filename.c_str());
state int lengthRemaining = f.length;
state int64_t blockOffset = f.offset;
while (lengthRemaining > 0) {
int blockSize = std::min(lengthRemaining, CLIENT_KNOBS->BGR_READ_BLOCK_SIZE);
int readSize = wait(reader->read(data + (blockOffset - f.offset), blockSize, blockOffset));
ASSERT(readSize <= lengthRemaining);
lengthRemaining -= readSize;
blockOffset += readSize;
}
state uint8_t* data = new (arena) uint8_t[f.length];
int readSize = wait(reader->read(data, f.length, f.offset));
ASSERT(f.length == readSize);
StringRef dataRef(data, f.length);
return Standalone<StringRef>(dataRef, arena);
@ -102,13 +93,13 @@ ACTOR Future<RangeResult> readBlobGranule(BlobGranuleChunkRef chunk,
}
state int numDeltaFiles = chunk.deltaFiles.size();
state StringRef* deltaData = new (arena) StringRef[numDeltaFiles];
state std::vector<StringRef> deltaData;
state int deltaIdx;
// for (Future<Standalone<StringRef>> deltaFuture : readDeltaFutures) {
deltaData.reserve(numDeltaFiles);
for (deltaIdx = 0; deltaIdx < numDeltaFiles; deltaIdx++) {
Standalone<StringRef> data = wait(readDeltaFutures[deltaIdx]);
deltaData[deltaIdx] = data;
deltaData.push_back(data);
arena.dependsOn(data.arena());
}

View File

@ -224,6 +224,11 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2 );
init( BLOBSTORE_MULTIPART_MAX_PART_SIZE, 20000000 );
init( BLOBSTORE_MULTIPART_MIN_PART_SIZE, 5242880 );
init( BLOBSTORE_GLOBAL_CONNECTION_POOL, true );
init( BLOBSTORE_ENABLE_LOGGING, true );
init( BLOBSTORE_STATS_LOGGING_INTERVAL, 10.0 );
init( BLOBSTORE_LATENCY_LOGGING_INTERVAL, 120.0 );
init( BLOBSTORE_LATENCY_LOGGING_ACCURACY, 0.01 );
// These are basically unlimited by default but can be used to reduce blob IO if needed
init( BLOBSTORE_REQUESTS_PER_SECOND, 200 );
@ -235,8 +240,6 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_READ_REQUESTS_PER_SECOND, 100 );
init( BLOBSTORE_DELETE_REQUESTS_PER_SECOND, 200 );
init( BGR_READ_BLOCK_SIZE, 20*1024*1024 ); if( randomize && BUGGIFY ) BGR_READ_BLOCK_SIZE = 64 * 1024 * deterministicRandom()->randomInt(1, 100);
// Dynamic Knobs
init( COMMIT_QUORUM_TIMEOUT, 3.0 );
init( GET_GENERATION_QUORUM_TIMEOUT, 3.0 );
@ -280,7 +283,7 @@ void ClientKnobs::initialize(Randomize randomize) {
// Blob granules
init( BG_MAX_GRANULE_PARALLELISM, 10 );
init( BG_TOO_MANY_GRANULES, 10000 );
init( BG_TOO_MANY_GRANULES, 20000 );
init( BLOB_METADATA_REFRESH_INTERVAL, 3600 ); if ( randomize && BUGGIFY ) { BLOB_METADATA_REFRESH_INTERVAL = deterministicRandom()->randomInt(5, 120); }
init( CHANGE_QUORUM_BAD_STATE_RETRY_TIMES, 3 );
@ -298,6 +301,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( CLIENT_ENABLE_USING_CLUSTER_ID_KEY, false );
init( ENABLE_ENCRYPTION_CPU_TIME_LOGGING, false );
init( SIMULATION_EKP_TENANT_IDS_TO_DROP, "-1" );
// clang-format on
}

View File

@ -19,6 +19,11 @@
*/
#include "fdbclient/DatabaseConfiguration.h"
#include "fdbclient/TenantEntryCache.actor.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbrpc/TenantInfo.h"
#include "fdbrpc/simulator.h"
#include "flow/FastRef.h"
#include "fmt/format.h"
#include "fdbclient/BackupAgent.actor.h"
#include "fdbclient/BackupContainer.h"
@ -606,7 +611,7 @@ struct EncryptedRangeFileWriter : public IRangeFileWriter {
int64_t dataLen,
Arena* arena) {
Reference<AsyncVar<ClientDBInfo> const> dbInfo = cx->clientInfo;
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::BACKUP));
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, header, BlobCipherMetrics::RESTORE));
ASSERT(cipherKeys.cipherHeaderKey.isValid() && cipherKeys.cipherTextKey.isValid());
validateEncryptionHeader(cipherKeys.cipherHeaderKey, cipherKeys.cipherTextKey, header);
DecryptBlobCipherAes256Ctr decryptor(
@ -1025,8 +1030,9 @@ private:
ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
Standalone<VectorRef<KeyValueRef>>* results,
bool encryptedBlock,
Optional<EncryptionAtRestMode> encryptMode,
Optional<BlobCipherEncryptHeader> encryptHeader) {
EncryptionAtRestMode encryptMode,
Optional<BlobCipherEncryptHeader> encryptHeader,
Optional<Reference<TenantEntryCache<Void>>> tenantCache) {
// Read begin key, if this fails then block was invalid.
state uint32_t kLen = reader->consumeNetworkUInt32();
state const uint8_t* k = reader->consume(kLen);
@ -1044,16 +1050,15 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
// make sure that all keys in a block belong to exactly one tenant,
// unless it's the last key, in which case it can be a truncated (different) tenant prefix
if (encryptedBlock && g_network && g_network->isSimulated()) {
ASSERT(encryptMode.present());
ASSERT(encryptHeader.present());
state KeyRef curKey = KeyRef(k, kLen);
if (!prevDomainId.present()) {
EncryptCipherDomainId domainId =
EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, encryptMode.get());
EncryptedRangeFileWriter::getEncryptionDomainDetails(prevKey, encryptMode);
prevDomainId = domainId;
}
EncryptCipherDomainId curDomainId =
EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, encryptMode.get());
EncryptedRangeFileWriter::getEncryptionDomainDetails(curKey, encryptMode);
if (!curKey.empty() && !prevKey.empty() && prevDomainId.get() != curDomainId) {
ASSERT(!done);
if (curDomainId != SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID && curDomainId != FDB_DEFAULT_ENCRYPT_DOMAIN_ID) {
@ -1076,9 +1081,22 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
}
// Read a value, which must exist or the block is invalid
uint32_t vLen = reader->consumeNetworkUInt32();
const uint8_t* v = reader->consume(vLen);
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
state uint32_t vLen = reader->consumeNetworkUInt32();
state const uint8_t* v = reader->consume(vLen);
if (tenantCache.present() && !isSystemKey(KeyRef(k, kLen))) {
state int64_t tenantId = TenantAPI::extractTenantIdFromKeyRef(StringRef(k, kLen));
Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(tenantId));
// The first and last KV pairs are not restored, so if the tenant is not found for the last key then it's ok
// to include it in the restore set
if (!payload.present() && !(reader->eof() || *reader->rptr == 0xFF)) {
TraceEvent(SevWarnAlways, "SnapshotRestoreTenantNotFound").detail("TenantId", tenantId);
CODE_PROBE(true, "Snapshot restore tenant not found");
} else {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
}
} else {
results->push_back(results->arena(), KeyValueRef(KeyRef(k, kLen), ValueRef(v, vLen)));
}
// If eof reached or first byte of next key len is 0xFF then a valid block end was reached.
if (reader->eof() || *reader->rptr == 0xFF)
@ -1096,7 +1114,7 @@ ACTOR static Future<Void> decodeKVPairs(StringRefReader* reader,
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
int64_t offset,
int len,
Optional<Database> cx) {
Database cx) {
state Standalone<StringRef> buf = makeString(len);
int rLen = wait(uncancellable(holdWhile(buf, file->read(mutateString(buf), len, offset))));
if (rLen != len)
@ -1107,19 +1125,26 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
state Standalone<VectorRef<KeyValueRef>> results({}, buf.arena());
state StringRefReader reader(buf, restore_corrupted_data());
state Arena arena;
state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
if (config.tenantMode == TenantMode::REQUIRED) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
state EncryptionAtRestMode encryptMode = config.encryptionAtRestMode;
state int64_t blockTenantId = TenantInfo::INVALID_TENANT;
try {
// Read header, currently only decoding BACKUP_AGENT_SNAPSHOT_FILE_VERSION or
// BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION
int32_t file_version = reader.consume<int32_t>();
if (file_version == BACKUP_AGENT_SNAPSHOT_FILE_VERSION) {
wait(decodeKVPairs(
&reader, &results, false, Optional<EncryptionAtRestMode>(), Optional<BlobCipherEncryptHeader>()));
wait(
decodeKVPairs(&reader, &results, false, encryptMode, Optional<BlobCipherEncryptHeader>(), tenantCache));
} else if (file_version == BACKUP_AGENT_ENCRYPTED_SNAPSHOT_FILE_VERSION) {
CODE_PROBE(true, "decoding encrypted block");
ASSERT(cx.present());
// decode options struct
uint32_t optionsLen = reader.consumeNetworkUInt32();
state uint32_t optionsLen = reader.consumeNetworkUInt32();
const uint8_t* o = reader.consume(optionsLen);
StringRef optionsStringRef = StringRef(o, optionsLen);
EncryptedRangeFileWriter::Options options =
@ -1127,29 +1152,38 @@ ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<
ASSERT(!options.compressionEnabled);
// read encryption header
const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
state const uint8_t* headerStart = reader.consume(BlobCipherEncryptHeader::headerSize);
StringRef headerS = StringRef(headerStart, BlobCipherEncryptHeader::headerSize);
state BlobCipherEncryptHeader header = BlobCipherEncryptHeader::fromStringRef(headerS);
blockTenantId = header.cipherTextDetails.encryptDomainId;
if (config.tenantMode == TenantMode::REQUIRED && !isReservedEncryptDomain(blockTenantId)) {
ASSERT(tenantCache.present());
Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache.get()->getById(blockTenantId));
if (!payload.present()) {
throw tenant_not_found();
}
}
const uint8_t* dataPayloadStart = headerStart + BlobCipherEncryptHeader::headerSize;
// calculate the total bytes read up to (and including) the header
int64_t bytesRead = sizeof(int32_t) + sizeof(uint32_t) + optionsLen + BlobCipherEncryptHeader::headerSize;
// get the size of the encrypted payload and decrypt it
int64_t dataLen = len - bytesRead;
StringRef decryptedData =
wait(EncryptedRangeFileWriter::decrypt(cx.get(), header, dataPayloadStart, dataLen, &results.arena()));
wait(EncryptedRangeFileWriter::decrypt(cx, header, dataPayloadStart, dataLen, &results.arena()));
reader = StringRefReader(decryptedData, restore_corrupted_data());
state Optional<EncryptionAtRestMode> encryptMode;
if (g_network && g_network->isSimulated()) {
// The encryption mode is only used during simulation for a sanity check
DatabaseConfiguration config = wait(getDatabaseConfiguration(cx.get()));
encryptMode = config.encryptionAtRestMode;
}
wait(decodeKVPairs(&reader, &results, true, encryptMode, header));
wait(decodeKVPairs(&reader, &results, true, encryptMode, header, tenantCache));
} else {
throw restore_unsupported_file_version();
}
return results;
} catch (Error& e) {
if (e.code() == error_code_encrypt_keys_fetch_failed) {
TraceEvent(SevWarnAlways, "SnapshotRestoreEncryptKeyFetchFailed").detail("TenantId", blockTenantId);
CODE_PROBE(true, "Snapshot restore encrypt keys not found");
} else if (e.code() == error_code_tenant_not_found) {
TraceEvent(SevWarnAlways, "EncryptedSnapshotRestoreTenantNotFound").detail("TenantId", blockTenantId);
CODE_PROBE(true, "Encrypted Snapshot restore tenant not found");
}
TraceEvent(SevWarn, "FileRestoreDecodeRangeFileBlockFailed")
.error(e)
.detail("Filename", file->getFilename())
@ -3529,6 +3563,16 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
return returnStr;
}
ACTOR static Future<Void> _validTenantAccess(KeyRef key, Reference<TenantEntryCache<Void>> tenantCache) {
if (isSystemKey(key)) {
return Void();
}
state int64_t tenantId = TenantAPI::extractTenantIdFromKeyRef(key);
Optional<TenantEntryCachePayload<Void>> payload = wait(tenantCache->getById(tenantId));
ASSERT(payload.present());
return Void();
}
ACTOR static Future<Void> _execute(Database cx,
Reference<TaskBucket> taskBucket,
Reference<FutureBucket> futureBucket,
@ -3577,8 +3621,25 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
}
state Reference<IAsyncFile> inFile = wait(bc.get()->readFile(rangeFile.fileName));
state Standalone<VectorRef<KeyValueRef>> blockData =
wait(decodeRangeFileBlock(inFile, readOffset, readLen, cx));
state Standalone<VectorRef<KeyValueRef>> blockData;
try {
Standalone<VectorRef<KeyValueRef>> data = wait(decodeRangeFileBlock(inFile, readOffset, readLen, cx));
blockData = data;
} catch (Error& e) {
// It's possible a tenant was deleted and the encrypt key fetch failed
if (e.code() == error_code_encrypt_keys_fetch_failed || e.code() == error_code_tenant_not_found) {
return Void();
}
throw;
}
state Optional<Reference<TenantEntryCache<Void>>> tenantCache;
state std::vector<Future<Void>> validTenantCheckFutures;
state Arena arena;
state DatabaseConfiguration config = wait(getDatabaseConfiguration(cx));
if (config.tenantMode == TenantMode::REQUIRED && g_network && g_network->isSimulated()) {
tenantCache = makeReference<TenantEntryCache<Void>>(cx, TenantEntryCacheRefreshMode::WATCH);
wait(tenantCache.get()->init());
}
// First and last key are the range for this file
state KeyRange fileRange = KeyRangeRef(blockData.front().key, blockData.back().key);
@ -3656,6 +3717,12 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
for (; i < iend; ++i) {
tr->setOption(FDBTransactionOptions::NEXT_WRITE_NO_WRITE_CONFLICT_RANGE);
if (tenantCache.present()) {
validTenantCheckFutures.push_back(_validTenantAccess(
StringRef(arena,
data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get())),
tenantCache.get()));
}
tr->set(data[i].key.removePrefix(removePrefix.get()).withPrefix(addPrefix.get()),
data[i].value);
}
@ -3671,6 +3738,11 @@ struct RestoreRangeTaskFunc : RestoreFileTaskFuncBase {
wait(tr->commit());
if (!validTenantCheckFutures.empty()) {
waitForAll(validTenantCheckFutures);
validTenantCheckFutures.clear();
}
TraceEvent("FileRestoreCommittedRange")
.suppressFor(60)
.detail("RestoreUID", restore.getUid())
@ -4663,7 +4735,7 @@ struct StartFullRestoreTaskFunc : RestoreTaskFuncBase {
keyRangesFilter.push_back_deep(keyRangesFilter.arena(), KeyRangeRef(r));
}
state Optional<RestorableFileSet> restorable =
wait(bc->getRestoreSet(restoreVersion, cx, keyRangesFilter, logsOnly, beginVersion));
wait(bc->getRestoreSet(restoreVersion, keyRangesFilter, logsOnly, beginVersion));
if (!restorable.present())
throw restore_missing_data();
@ -4917,7 +4989,7 @@ public:
.detail("OverrideTargetVersion", targetVersion);
}
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion, cx));
Optional<RestorableFileSet> restoreSet = wait(bc->getRestoreSet(targetVersion));
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")
@ -5895,7 +5967,7 @@ public:
beginVersion = *std::min_element(beginVersions.begin(), beginVersions.end());
}
Optional<RestorableFileSet> restoreSet =
wait(bc->getRestoreSet(targetVersion, cx, ranges, onlyApplyMutationLogs, beginVersion));
wait(bc->getRestoreSet(targetVersion, ranges, onlyApplyMutationLogs, beginVersion));
if (!restoreSet.present()) {
TraceEvent(SevWarn, "FileBackupAgentRestoreNotPossible")

View File

@ -46,6 +46,7 @@
#include "flow/Platform.h"
#include "flow/ProtocolVersion.h"
#include "flow/UnitTest.h"
#include "flow/Trace.h"
#ifdef __unixish__
#include <fcntl.h>
@ -1466,8 +1467,11 @@ ThreadFuture<Standalone<VectorRef<BlobGranuleChunkRef>>> MultiVersionTransaction
Version beginVersion,
Optional<Version> readVersion,
Version* readVersionOut) {
// can't call this directly
return ThreadFuture<Standalone<VectorRef<BlobGranuleChunkRef>>>(unsupported_operation());
return executeOperation(&ITransaction::readBlobGranulesStart,
keyRange,
std::forward<Version>(beginVersion),
std::forward<Optional<Version>>(readVersion),
std::forward<Version*>(readVersionOut));
}
ThreadResult<RangeResult> MultiVersionTransaction::readBlobGranulesFinish(
@ -2903,123 +2907,129 @@ void MultiVersionApi::setNetworkOptionInternal(FDBNetworkOptions::Option option,
}
void MultiVersionApi::setupNetwork() {
if (!externalClient) {
loadEnvironmentVariableNetworkOptions();
}
uint64_t transportId = 0;
{ // lock scope
MutexHolder holder(lock);
if (networkStartSetup) {
throw network_already_setup();
try {
if (!externalClient) {
loadEnvironmentVariableNetworkOptions();
}
if (threadCount > 1) {
disableLocalClient();
}
uint64_t transportId = 0;
{ // lock scope
MutexHolder holder(lock);
if (networkStartSetup) {
throw network_already_setup();
}
if (!apiVersion.hasFailOnExternalClientErrors()) {
ignoreExternalClientFailures = true;
}
if (threadCount > 1) {
disableLocalClient();
}
for (auto i : externalClientDescriptions) {
std::string path = i.second.libPath;
std::string filename = basename(path);
bool useFutureVersion = i.second.useFutureVersion;
networkStartSetup = true;
// Copy external lib for each thread
if (externalClients.count(filename) == 0) {
externalClients[filename] = {};
auto libCopies = copyExternalLibraryPerThread(path);
for (int idx = 0; idx < libCopies.size(); ++idx) {
bool unlinkOnLoad = libCopies[idx].second && !retainClientLibCopies;
externalClients[filename].push_back(Reference<ClientInfo>(
new ClientInfo(new DLApi(libCopies[idx].first, unlinkOnLoad /*unlink on load*/),
path,
useFutureVersion,
idx)));
if (externalClientDescriptions.empty() && localClientDisabled) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and no external clients configured");
throw no_external_client_provided();
}
if (externalClientDescriptions.empty() && !disableBypass) {
bypassMultiClientApi = true; // SOMEDAY: we won't be able to set this option once it becomes possible to
// add clients after setupNetwork is called
}
if (!bypassMultiClientApi) {
transportId =
(uint64_t(uint32_t(platform::getRandomSeed())) << 32) ^ uint32_t(platform::getRandomSeed());
if (transportId <= 1)
transportId += 2;
localClient->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
}
localClient->api->setupNetwork();
if (!apiVersion.hasFailOnExternalClientErrors()) {
ignoreExternalClientFailures = true;
}
for (auto i : externalClientDescriptions) {
std::string path = i.second.libPath;
std::string filename = basename(path);
bool useFutureVersion = i.second.useFutureVersion;
// Copy external lib for each thread
if (externalClients.count(filename) == 0) {
externalClients[filename] = {};
auto libCopies = copyExternalLibraryPerThread(path);
for (int idx = 0; idx < libCopies.size(); ++idx) {
bool unlinkOnLoad = libCopies[idx].second && !retainClientLibCopies;
externalClients[filename].push_back(Reference<ClientInfo>(
new ClientInfo(new DLApi(libCopies[idx].first, unlinkOnLoad /*unlink on load*/),
path,
useFutureVersion,
idx)));
}
}
}
}
if (externalClients.empty() && localClientDisabled) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and no external clients configured");
localClient->loadVersion();
throw no_external_client_provided();
if (bypassMultiClientApi) {
networkSetup = true;
} else {
runOnExternalClientsAllThreads(
[this](Reference<ClientInfo> client) {
TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath);
client->api->selectApiVersion(apiVersion.version());
if (client->useFutureVersion) {
client->api->useFutureProtocolVersion();
}
client->loadVersion();
},
false,
!ignoreExternalClientFailures);
std::string baseTraceFileId;
if (apiVersion.hasTraceFileIdentifier()) {
// TRACE_FILE_IDENTIFIER option is supported since 6.3
baseTraceFileId = traceFileIdentifier.empty() ? format("%d", getpid()) : traceFileIdentifier;
}
MutexHolder holder(lock);
runOnExternalClientsAllThreads(
[this, transportId, baseTraceFileId](Reference<ClientInfo> client) {
for (auto option : options) {
client->api->setNetworkOption(option.first, option.second.castTo<StringRef>());
}
client->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
if (!baseTraceFileId.empty()) {
client->api->setNetworkOption(FDBNetworkOptions::TRACE_FILE_IDENTIFIER,
traceShareBaseNameAmongThreads
? baseTraceFileId
: client->getTraceFileIdentifier(baseTraceFileId));
}
client->api->setupNetwork();
},
false,
!ignoreExternalClientFailures);
if (localClientDisabled && !hasNonFailedExternalClients()) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and all external clients failed");
throw all_external_clients_failed();
}
networkSetup = true; // Needs to be guarded by mutex
}
networkStartSetup = true;
if (externalClients.empty() && !disableBypass) {
bypassMultiClientApi = true; // SOMEDAY: we won't be able to set this option once it becomes possible to
// add clients after setupNetwork is called
}
if (!bypassMultiClientApi) {
transportId = (uint64_t(uint32_t(platform::getRandomSeed())) << 32) ^ uint32_t(platform::getRandomSeed());
if (transportId <= 1)
transportId += 2;
localClient->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
}
localClient->api->setupNetwork();
options.clear();
updateSupportedVersions();
} catch (Error& e) {
// Make sure all error and warning events are traced
flushTraceFileVoid();
throw e;
}
localClient->loadVersion();
if (bypassMultiClientApi) {
networkSetup = true;
} else {
runOnExternalClientsAllThreads(
[this](Reference<ClientInfo> client) {
TraceEvent("InitializingExternalClient").detail("LibraryPath", client->libPath);
client->api->selectApiVersion(apiVersion.version());
if (client->useFutureVersion) {
client->api->useFutureProtocolVersion();
}
client->loadVersion();
},
false,
!ignoreExternalClientFailures);
std::string baseTraceFileId;
if (apiVersion.hasTraceFileIdentifier()) {
// TRACE_FILE_IDENTIFIER option is supported since 6.3
baseTraceFileId = traceFileIdentifier.empty() ? format("%d", getpid()) : traceFileIdentifier;
}
MutexHolder holder(lock);
runOnExternalClientsAllThreads(
[this, transportId, baseTraceFileId](Reference<ClientInfo> client) {
for (auto option : options) {
client->api->setNetworkOption(option.first, option.second.castTo<StringRef>());
}
client->api->setNetworkOption(FDBNetworkOptions::EXTERNAL_CLIENT_TRANSPORT_ID,
std::to_string(transportId));
if (!baseTraceFileId.empty()) {
client->api->setNetworkOption(FDBNetworkOptions::TRACE_FILE_IDENTIFIER,
traceShareBaseNameAmongThreads
? baseTraceFileId
: client->getTraceFileIdentifier(baseTraceFileId));
}
client->api->setupNetwork();
},
false,
!ignoreExternalClientFailures);
if (localClientDisabled && !hasNonFailedExternalClients()) {
TraceEvent(SevWarn, "CannotSetupNetwork")
.detail("Reason", "Local client is disabled and all external clients failed");
throw all_external_clients_failed();
}
networkSetup = true; // Needs to be guarded by mutex
}
options.clear();
updateSupportedVersions();
}
THREAD_FUNC_RETURN runNetworkThread(void* param) {

View File

@ -161,7 +161,7 @@ TLSConfig tlsConfig(TLSEndpointType::CLIENT);
// The default values, TRACE_DEFAULT_ROLL_SIZE and TRACE_DEFAULT_MAX_LOGS_SIZE are located in Trace.h.
NetworkOptions::NetworkOptions()
: traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"),
traceFormat("xml"), traceClockSource("now"),
traceFormat("xml"), traceClockSource("now"), traceInitializeOnSetup(false),
supportedVersions(new ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>()), runLoopProfilingEnabled(false),
primaryClient(true) {}
@ -2217,6 +2217,99 @@ void DatabaseContext::expireThrottles() {
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString& ccs);
// Initialize tracing for FDB client
//
// connRecord is necessary for determining the local IP, which is then included in the trace
// file name, and also used to annotate all trace events.
//
// If trace_initialize_on_setup is not set, tracing is initialized when opening a database.
// In that case we can immediately determine the IP. Thus, we can use the IP in the
// trace file name and annotate all events with it.
//
// If the trace_initialize_on_setup network option is set, tracing is at first initialized without
// connRecord and thus without the local IP. In that case we cannot use the local IP in the
// trace file names. The IP is then provided by a repeated call to initializeClientTracing
// when opening a database. All tracing events from this point are annotated with the local IP.
//
// If tracing initialization is completed, further calls to initializeClientTracing are ignored
void initializeClientTracing(Reference<IClusterConnectionRecord> connRecord, Optional<int> apiVersion) {
if (!networkOptions.traceDirectory.present()) {
return;
}
bool initialized = traceFileIsOpen();
if (initialized && (isTraceLocalAddressSet() || !connRecord)) {
// Tracing initialization is completed
return;
}
// Network must be created before initializing tracing
ASSERT(g_network);
Optional<NetworkAddress> localAddress;
if (connRecord) {
auto publicIP = determinePublicIPAutomatically(connRecord->getConnectionString());
localAddress = NetworkAddress(publicIP, ::getpid());
}
platform::ImageInfo imageInfo = platform::getImageInfo();
if (initialized) {
// Tracing already initialized, just need to update the IP address
setTraceLocalAddress(localAddress.get());
TraceEvent("ClientStart")
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr))
.detail("ApiVersion", apiVersion)
.detail("ClientLibrary", imageInfo.fileName)
.detailf("ImageOffset", "%p", imageInfo.offset)
.detail("Primary", networkOptions.primaryClient)
.trackLatest("ClientStart");
} else {
// Initialize tracing
selectTraceFormatter(networkOptions.traceFormat);
selectTraceClockSource(networkOptions.traceClockSource);
addUniversalTraceField("ClientDescription",
format("%s-%s-%" PRIu64,
networkOptions.primaryClient ? "primary" : "external",
FDB_VT_VERSION,
deterministicRandom()->randomUInt64()));
std::string identifier = networkOptions.traceFileIdentifier;
openTraceFile(localAddress,
networkOptions.traceRollSize,
networkOptions.traceMaxLogsSize,
networkOptions.traceDirectory.get(),
"trace",
networkOptions.traceLogGroup,
identifier,
networkOptions.tracePartialFileSuffix);
TraceEvent("ClientStart")
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr))
.detail("ApiVersion", apiVersion)
.detail("ClientLibrary", imageInfo.fileName)
.detailf("ImageOffset", "%p", imageInfo.offset)
.detail("Primary", networkOptions.primaryClient)
.trackLatest("ClientStart");
g_network->initMetrics();
FlowTransport::transport().initMetrics();
initTraceEventMetrics();
}
// Initialize system monitoring once the local IP is available
if (localAddress.present()) {
initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(localAddress.get().ip)));
systemMonitor();
uncancellable(recurring(&systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace));
}
}
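// Illustrative only, not part of this change: a hedged sketch of the client flow the new option
// enables, written against the fdbclient entry points referenced above; the option values and the
// cluster-file path are placeholders.
//
//   setNetworkOption(FDBNetworkOptions::TRACE_ENABLE, "/var/log/fdb"_sr);
//   setNetworkOption(FDBNetworkOptions::TRACE_INITIALIZE_ON_SETUP);
//   setupNetwork();                         // trace files open here, without the local IP
//   Database db = Database::createDatabase("fdb.cluster", apiVersion);
//                                           // initializeClientTracing runs again with a connection
//                                           // record, sets the local address, and starts system
//                                           // monitoring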
// Creates a database object that represents a connection to a cluster
// This constructor uses a preallocated DatabaseContext that may have been created
// on another thread
@ -2230,49 +2323,7 @@ Database Database::createDatabase(Reference<IClusterConnectionRecord> connRecord
ASSERT(TraceEvent::isNetworkThread());
platform::ImageInfo imageInfo = platform::getImageInfo();
if (connRecord) {
if (networkOptions.traceDirectory.present() && !traceFileIsOpen()) {
g_network->initMetrics();
FlowTransport::transport().initMetrics();
initTraceEventMetrics();
auto publicIP = determinePublicIPAutomatically(connRecord->getConnectionString());
selectTraceFormatter(networkOptions.traceFormat);
selectTraceClockSource(networkOptions.traceClockSource);
addUniversalTraceField("ClientDescription",
format("%s-%s-%" PRIu64,
networkOptions.primaryClient ? "primary" : "external",
FDB_VT_VERSION,
getTraceThreadId()));
openTraceFile(NetworkAddress(publicIP, ::getpid()),
networkOptions.traceRollSize,
networkOptions.traceMaxLogsSize,
networkOptions.traceDirectory.get(),
"trace",
networkOptions.traceLogGroup,
networkOptions.traceFileIdentifier,
networkOptions.tracePartialFileSuffix);
TraceEvent("ClientStart")
.detail("SourceVersion", getSourceVersion())
.detail("Version", FDB_VT_VERSION)
.detail("PackageName", FDB_VT_PACKAGE_NAME)
.detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr))
.detail("ApiVersion", apiVersion)
.detail("ClientLibrary", imageInfo.fileName)
.detailf("ImageOffset", "%p", imageInfo.offset)
.detail("Primary", networkOptions.primaryClient)
.trackLatest("ClientStart");
initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(publicIP)));
systemMonitor();
uncancellable(recurring(&systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace));
}
}
initializeClientTracing(connRecord, apiVersion);
g_network->initTLS();
@ -2324,7 +2375,7 @@ Database Database::createDatabase(Reference<IClusterConnectionRecord> connRecord
.detail("Version", FDB_VT_VERSION)
.detail("ClusterFile", connRecord ? connRecord->toString() : "None")
.detail("ConnectionString", connRecord ? connRecord->getConnectionString().toString() : "None")
.detail("ClientLibrary", imageInfo.fileName)
.detail("ClientLibrary", platform::getImageInfo().fileName)
.detail("Primary", networkOptions.primaryClient)
.detail("Internal", internal)
.trackLatest(database->connectToDatabaseEventCacheHolder.trackingKey);
@ -2408,6 +2459,9 @@ void setNetworkOption(FDBNetworkOptions::Option option, Optional<StringRef> valu
validateOptionValuePresent(value);
networkOptions.tracePartialFileSuffix = value.get().toString();
break;
case FDBNetworkOptions::TRACE_INITIALIZE_ON_SETUP:
networkOptions.traceInitializeOnSetup = true;
break;
case FDBNetworkOptions::KNOB: {
validateOptionValuePresent(value);
@ -2608,6 +2662,10 @@ void setupNetwork(uint64_t transportId, UseMetrics useMetrics) {
FlowTransport::createInstance(true, transportId, WLTOKEN_RESERVED_COUNT);
Net2FileSystem::newFileSystem();
if (networkOptions.traceInitializeOnSetup) {
::initializeClientTracing({}, {});
}
uncancellable(monitorNetworkBusyness());
}
@ -2803,6 +2861,10 @@ int64_t Tenant::id() const {
return idFuture.get();
}
Future<int64_t> Tenant::getIdFuture() const {
return idFuture;
}
KeyRef Tenant::prefix() const {
ASSERT(idFuture.isReady());
if (bigEndianId == -1) {
@ -8599,24 +8661,36 @@ ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLoc
try {
state int i = 0;
for (; i < locations.size(); i++) {
SplitMetricsRequest req(
locations[i].range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() && res.splits[0] <= results.back()) { // split points are out of order, possibly
// because of moving data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
state Key beginKey = locations[i].range.begin;
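// A single reply may be truncated server-side (see the new SPLIT_METRICS_MAX_ROWS knob); when
// res.more is set, re-issue the request starting from the last returned split point until this
// shard range is exhausted.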
loop {
KeyRangeRef range(beginKey, locations[i].range.end);
SplitMetricsRequest req(range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
&StorageServerInterface::splitMetrics,
req,
TaskPriority::DataDistribution));
if (res.splits.size() &&
res.splits[0] <= results.back()) { // split points are out of order, possibly
// because of moving data, throw error to retry
ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
throw all_alternatives_failed();
}
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
if (res.splits.size()) {
results.append(results.arena(), res.splits.begin(), res.splits.size());
results.arena().dependsOn(res.splits.arena());
}
used = res.used;
if (res.more && res.splits.size()) {
// Next request will return split points after this one
beginKey = KeyRef(beginKey.arena(), res.splits.back());
} else {
break;
}
//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
}
}
if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) {
@ -10911,7 +10985,7 @@ ACTOR Future<bool> blobRestoreActor(Reference<DatabaseContext> cx, KeyRange rang
Optional<Value> value = wait(tr->get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (status.phase != BlobRestorePhase::DONE) {
if (status.phase < BlobRestorePhase::DONE) {
return false; // stop if there is in-progress restore.
}
}

View File

@ -69,6 +69,11 @@ S3BlobStoreEndpoint::Stats S3BlobStoreEndpoint::Stats::operator-(const Stats& rh
}
S3BlobStoreEndpoint::Stats S3BlobStoreEndpoint::s_stats;
std::unique_ptr<S3BlobStoreEndpoint::BlobStats> S3BlobStoreEndpoint::blobStats;
Future<Void> S3BlobStoreEndpoint::statsLogger = Never();
std::unordered_map<BlobStoreConnectionPoolKey, Reference<S3BlobStoreEndpoint::ConnectionPoolData>>
S3BlobStoreEndpoint::globalConnectionPool;
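// Process-wide connection pool shared across S3BlobStoreEndpoint instances, keyed by connection
// target; presumably selected in place of the per-endpoint pool when the new
// global_connection_pool ("gcp") knob is enabled.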
S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
secure_connection = 1;
@ -96,6 +101,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
max_send_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_SEND_BYTES_PER_SECOND;
max_recv_bytes_per_second = CLIENT_KNOBS->BLOBSTORE_MAX_RECV_BYTES_PER_SECOND;
sdk_auth = false;
global_connection_pool = CLIENT_KNOBS->BLOBSTORE_GLOBAL_CONNECTION_POOL;
}
bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
@ -134,6 +140,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
TRY_PARAM(max_send_bytes_per_second, sbps);
TRY_PARAM(max_recv_bytes_per_second, rbps);
TRY_PARAM(sdk_auth, sa);
TRY_PARAM(global_connection_pool, gcp);
#undef TRY_PARAM
return false;
}
@ -171,6 +178,8 @@ std::string S3BlobStoreEndpoint::BlobKnobs::getURLParameters() const {
_CHECK_PARAM(read_cache_blocks_per_file, rcb);
_CHECK_PARAM(max_send_bytes_per_second, sbps);
_CHECK_PARAM(max_recv_bytes_per_second, rbps);
_CHECK_PARAM(sdk_auth, sa);
_CHECK_PARAM(global_connection_pool, gcp);
#undef _CHECK_PARAM
return r;
}
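// Illustrative only, not part of this change: assuming the usual blobstore query-parameter
// syntax, the new knob can also be set per URL, e.g.
//   blobstore://<key>:<secret>@<host>/<name>?bucket=<bucket>&gcp=0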
@ -721,20 +730,23 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
bool* reusingConn) {
// First try to get a connection from the pool
*reusingConn = false;
while (!b->connectionPool.empty()) {
S3BlobStoreEndpoint::ReusableConnection rconn = b->connectionPool.front();
b->connectionPool.pop();
while (!b->connectionPool->pool.empty()) {
S3BlobStoreEndpoint::ReusableConnection rconn = b->connectionPool->pool.front();
b->connectionPool->pool.pop();
// If the connection expires in the future then return it
if (rconn.expirationTime > now()) {
*reusingConn = true;
++b->blobStats->reusedConnections;
TraceEvent("S3BlobStoreEndpointReusingConnected")
.suppressFor(60)
.detail("RemoteEndpoint", rconn.conn->getPeerAddress())
.detail("ExpiresIn", rconn.expirationTime - now());
return rconn;
}
++b->blobStats->expiredConnections;
}
++b->blobStats->newConnections;
std::string host = b->host, service = b->service;
if (service.empty()) {
if (b->useProxy) {
@ -743,7 +755,7 @@ ACTOR Future<S3BlobStoreEndpoint::ReusableConnection> connect_impl(Reference<S3B
}
service = b->knobs.secure_connection ? "https" : "http";
}
bool isTLS = b->knobs.secure_connection == 1;
bool isTLS = b->knobs.isTLS();
state Reference<IConnection> conn;
if (b->useProxy) {
if (isTLS) {
@ -779,7 +791,9 @@ Future<S3BlobStoreEndpoint::ReusableConnection> S3BlobStoreEndpoint::connect(boo
void S3BlobStoreEndpoint::returnConnection(ReusableConnection& rconn) {
// If it expires in the future then add it to the pool in the front
if (rconn.expirationTime > now()) {
connectionPool.push(rconn);
connectionPool->pool.push(rconn);
} else {
++blobStats->expiredConnections;
}
rconn.conn = Reference<IConnection>();
}
@ -945,6 +959,8 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
// received the "Connection: close" header.
if (r->headers["Connection"] != "close") {
bstore->returnConnection(rconn);
} else {
++bstore->blobStats->expiredConnections;
}
rconn.conn.clear();
@ -958,16 +974,19 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
double end = g_network->timer();
double connectDuration = reqStartTimer - connectStartTimer;
double reqDuration = end - reqStartTimer;
bstore->blobStats->requestLatency.addMeasurement(reqDuration);
// If err is not present then r is valid.
// If r->code is in successCodes then record the successful request and return r.
if (!err.present() && successCodes.count(r->code) != 0) {
bstore->s_stats.requests_successful++;
++bstore->blobStats->requestsSuccessful;
return r;
}
// Otherwise, this request is considered failed. Update failure count.
bstore->s_stats.requests_failed++;
++bstore->blobStats->requestsFailed;
// All errors in err are potentially retryable as well as certain HTTP response codes...
bool retryable = err.present() || r->code == 500 || r->code == 502 || r->code == 503 || r->code == 429;
@ -1014,6 +1033,7 @@ ACTOR Future<Reference<HTTP::Response>> doRequest_impl(Reference<S3BlobStoreEndp
++thisTry;
if (fastRetry) {
++bstore->blobStats->fastRetries;
wait(delay(0));
} else if (retryable) {
// We will wait delay seconds before the next retry, start with nextRetryDelay.

View File

@ -981,6 +981,10 @@ const KeyRef JSONSchemas::statusSchema = R"statusSchema(
"expired_age": 0,
"oldest_id_version": 0,
"oldest_id_age": 0
},
"version_epoch":{
"enabled": false,
"epoch": 0
}
},
"client":{

View File

@ -70,7 +70,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( MAX_MESSAGE_SIZE, std::max<int>(LOG_SYSTEM_PUSHED_DATA_BLOCK_SIZE, 1e5 + 2e4 + 1) + 8 ); // VALUE_SIZE_LIMIT + SYSTEM_KEY_SIZE_LIMIT + 9 bytes (4 bytes for length, 4 bytes for sequence number, and 1 byte for mutation type)
init( TLOG_MESSAGE_BLOCK_BYTES, 10e6 );
init( TLOG_MESSAGE_BLOCK_OVERHEAD_FACTOR, double(TLOG_MESSAGE_BLOCK_BYTES) / (TLOG_MESSAGE_BLOCK_BYTES - MAX_MESSAGE_SIZE) ); //1.0121466709838096006362758832473
init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = deterministicRandom()->coinflip() ? 0.1 : 120;
init( PEEK_TRACKER_EXPIRATION_TIME, 600 ); if( randomize && BUGGIFY ) PEEK_TRACKER_EXPIRATION_TIME = 120; // Cannot be buggified lower without changing the following assert in LogSystemPeekCursor.actor.cpp: ASSERT_WE_THINK(e.code() == error_code_operation_obsolete || SERVER_KNOBS->PEEK_TRACKER_EXPIRATION_TIME < 10);
init( PEEK_USING_STREAMING, false ); if( randomize && isSimulated && BUGGIFY ) PEEK_USING_STREAMING = true;
init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2;
init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 );
@ -736,8 +736,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BW_THROTTLING_ENABLED, true );
bool buggifySmallBWLag = randomize && BUGGIFY;
init( TARGET_BW_LAG, 240.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0;
init( TARGET_BW_LAG_BATCH, 200.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0;
init( TARGET_BW_LAG, 90.0 ); if(buggifySmallBWLag) TARGET_BW_LAG = 10.0;
init( TARGET_BW_LAG_BATCH, 60.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_BATCH = 4.0;
init( TARGET_BW_LAG_UPDATE, 9.0 ); if(buggifySmallBWLag) TARGET_BW_LAG_UPDATE = 1.0;
init( MIN_BW_HISTORY, 10 );
init( BW_ESTIMATION_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_ESTIMATION_INTERVAL = 2.0;
@ -746,7 +746,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BW_FETCH_WORKERS_INTERVAL, 5.0 );
init( BW_RW_LOGGING_INTERVAL, 5.0 );
init( BW_MAX_BLOCKED_INTERVAL, 10.0 ); if(buggifySmallBWLag) BW_MAX_BLOCKED_INTERVAL = 2.0;
init( BW_RK_SIM_QUIESCE_DELAY, 150.0 );
init( BW_RK_SIM_QUIESCE_DELAY, 300.0 );
init( MAX_AUTO_THROTTLED_TRANSACTION_TAGS, 5 ); if(randomize && BUGGIFY) MAX_AUTO_THROTTLED_TRANSACTION_TAGS = 1;
init( MAX_MANUAL_THROTTLED_TRANSACTION_TAGS, 40 ); if(randomize && BUGGIFY) MAX_MANUAL_THROTTLED_TRANSACTION_TAGS = 1;
@ -850,6 +850,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// This exists for flexibility but assigning each ReadType to its own unique priority number makes the most sense
// The enumeration is currently: eager, fetch, low, normal, high
init( STORAGESERVER_READTYPE_PRIORITY_MAP, "0,1,2,3,4" );
init( SPLIT_METRICS_MAX_ROWS, 10000 );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -974,7 +975,6 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_IO_PRIORITIES, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
init( LATENCY_SKETCH_ACCURACY, 0.01 );
@ -1005,9 +1005,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY);
// BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs"
init( BG_METADATA_SOURCE, "knobs" );
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000;
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 20000000 ); if ( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 50000 * deterministicRandom()->randomInt(1, 4); else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 50000 * deterministicRandom()->randomInt(1, 20);
init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8));
init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 );
init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 ); if ( randomize && BUGGIFY ) BG_DELTA_BYTES_BEFORE_COMPACT *= (1.0 + deterministicRandom()->random01() * 3.0)/2.0;
init( BG_DELTA_FILE_TARGET_BYTES, BG_DELTA_BYTES_BEFORE_COMPACT/10 );
init( BG_DELTA_FILE_TARGET_CHUNK_BYTES, 32*1024 ); if ( randomize && BUGGIFY ) BG_DELTA_FILE_TARGET_CHUNK_BYTES = BG_DELTA_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 7));
init( BG_MAX_SPLIT_FANOUT, 10 ); if( randomize && BUGGIFY ) BG_MAX_SPLIT_FANOUT = deterministicRandom()->randomInt(5, 15);
@ -1021,16 +1021,21 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_WRITE_MULTIPART, false ); if (randomize && BUGGIFY) BG_WRITE_MULTIPART = true;
init( BG_ENABLE_DYNAMIC_WRITE_AMP, true ); if (randomize && BUGGIFY) BG_ENABLE_DYNAMIC_WRITE_AMP = false;
init( BG_DYNAMIC_WRITE_AMP_MIN_FACTOR, 0.5 );
init( BG_DYNAMIC_WRITE_AMP_DECREASE_FACTOR, 0.8 );
init( BG_ENABLE_MERGING, true ); if (randomize && BUGGIFY) BG_ENABLE_MERGING = false;
init( BG_MERGE_CANDIDATE_THRESHOLD_SECONDS, isSimulated ? 20.0 : 30 * 60 ); if (randomize && BUGGIFY) BG_MERGE_CANDIDATE_THRESHOLD_SECONDS = 5.0;
init( BG_MERGE_CANDIDATE_DELAY_SECONDS, BG_MERGE_CANDIDATE_THRESHOLD_SECONDS / 10.0 );
init( BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM, 8 ); if( randomize && BUGGIFY ) BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM = 1;
// The resnapshot/delta parallelism knobs are deprecated and replaced by the budget_bytes knobs! FIXME: remove after next release
init( BLOB_WORKER_RESNAPSHOT_PARALLELISM, 40 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_PARALLELISM = deterministicRandom()->randomInt(1, 10);
init( BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM, 2000 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM = deterministicRandom()->randomInt(10, 100);
init( BLOB_WORKER_RDC_PARALLELISM, 2 ); if( randomize && BUGGIFY ) BLOB_WORKER_RDC_PARALLELISM = deterministicRandom()->randomInt(1, 6);
init( BLOB_WORKER_RESNAPSHOT_BUDGET_BYTES, 1024*1024*1024 ); if( randomize && BUGGIFY ) BLOB_WORKER_RESNAPSHOT_BUDGET_BYTES = deterministicRandom()->random01() * 10 * BG_SNAPSHOT_FILE_TARGET_BYTES;
init( BLOB_WORKER_DELTA_WRITE_BUDGET_BYTES, 1024*1024*1024 ); if( randomize && BUGGIFY ) BLOB_WORKER_DELTA_WRITE_BUDGET_BYTES = (5 + 45*deterministicRandom()->random01()) * BG_DELTA_FILE_TARGET_BYTES;
init( BLOB_WORKER_TIMEOUT, 10.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_TIMEOUT = 1.0;
init( BLOB_WORKER_REQUEST_TIMEOUT, 5.0 ); if( randomize && BUGGIFY ) BLOB_WORKER_REQUEST_TIMEOUT = 1.0;
init( BLOB_WORKERLIST_FETCH_INTERVAL, 1.0 );
@ -1050,6 +1055,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANIFEST_RW_ROWS, isSimulated ? 10 : 1000);
init( BLOB_RESTORE_MLOGS_URL, isSimulated ? "file://simfdb/backups/" : "");
init( BLOB_MIGRATOR_ERROR_RETRIES, 20);
init( BLOB_RESTORE_MANIFEST_URL, isSimulated ? "file://simfdb/fdbblob/manifest" : "");
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );

View File

@ -43,9 +43,11 @@ int64_t extractTenantIdFromMutation(MutationRef m) {
if (isSingleKeyMutation((MutationRef::Type)m.type)) {
// The first 8 bytes of the key of this OP is also an 8-byte number
if (m.type == MutationRef::SetVersionstampedKey && m.param1.size() >= 4 &&
parseVersionstampOffset(m.param1) < 8) {
return TenantInfo::INVALID_TENANT;
if (m.type == MutationRef::SetVersionstampedKey && m.param1.size() >= 4) {
// when the timestamp overlaps with the first 8 bytes
if (parseVersionstampOffset(m.param1) < 8) {
return TenantInfo::INVALID_TENANT;
}
}
} else {
// Assumes clear range mutations are split on tenant boundaries

View File

@ -245,7 +245,7 @@ Reference<ITransaction> ThreadSafeTenant::createTransaction() {
ThreadFuture<int64_t> ThreadSafeTenant::getId() {
Tenant* tenant = this->tenant;
return onMainThread([tenant]() -> Future<int64_t> { return tenant->id(); });
return onMainThread([tenant]() -> Future<int64_t> { return tenant->getIdFuture(); });
}
ThreadFuture<Key> ThreadSafeTenant::purgeBlobGranules(const KeyRangeRef& keyRange, Version purgeVersion, bool force) {

View File

@ -59,8 +59,8 @@ struct AuditStorageState {
std::string toString() const {
std::string res = "AuditStorageState: [ID]: " + id.toString() +
"[Range]: " + Traceable<KeyRangeRef>::toString(range) + "[Type]: " + std::to_string(type) +
"[Phase]: " + std::to_string(phase);
", [Range]: " + Traceable<KeyRangeRef>::toString(range) +
", [Type]: " + std::to_string(type) + ", [Phase]: " + std::to_string(phase);
if (!error.empty()) {
res += "[Error]: " + error;
}

View File

@ -1004,7 +1004,7 @@ namespace fileBackup {
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeRangeFileBlock(Reference<IAsyncFile> file,
int64_t offset,
int len,
Optional<Database> cx);
Database cx);
// Reads a mutation log block from file and parses into batch mutation blocks for further parsing.
ACTOR Future<Standalone<VectorRef<KeyValueRef>>> decodeMutationLogFileBlock(Reference<IAsyncFile> file,

View File

@ -253,7 +253,7 @@ public:
// Returns the key ranges in the snapshot file. This is an expensive function
// and should only be used in simulation for sanity check.
virtual Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Optional<Database> cx) = 0;
virtual Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Database cx) = 0;
struct ExpireProgress {
std::string step;
@ -292,7 +292,6 @@ public:
// If logsOnly is set, only use log files in [beginVersion, targetVervions) in restore set.
// Returns non-present if restoring to the given version is not possible.
virtual Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion,
Optional<Database> cx,
VectorRef<KeyRangeRef> keyRangesFilter = {},
bool logsOnly = false,
Version beginVersion = -1) = 0;

View File

@ -155,10 +155,9 @@ public:
ExpireProgress* progress,
Version restorableBeginVersion) final;
Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Optional<Database> cx) final;
Future<KeyRange> getSnapshotFileKeyRange(const RangeFile& file, Database cx) final;
Future<Optional<RestorableFileSet>> getRestoreSet(Version targetVersion,
Optional<Database> cx,
VectorRef<KeyRangeRef> keyRangesFilter,
bool logsOnly,
Version beginVersion) final;

View File

@ -70,6 +70,7 @@ public:
KV_REDWOOD,
BLOB_GRANULE,
BACKUP,
RESTORE,
TEST,
MAX,
};

View File

@ -55,6 +55,23 @@ struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
}
};
#pragma pack(push, 4)
struct GranuleMutationRef {
MutationRef::Type type;
Version version;
StringRef param1;
StringRef param2;
GranuleMutationRef() {}
GranuleMutationRef(MutationRef::Type t, Version v, StringRef param1, StringRef param2)
: type(t), version(v), param1(param1), param2(param2) {}
GranuleMutationRef(Arena& to, MutationRef::Type t, Version v, StringRef param1, StringRef param2)
: type(t), version(v), param1(to, param1), param2(to, param2) {}
GranuleMutationRef(Arena& to, const GranuleMutationRef& from)
: type(from.type), version(from.version), param1(to, from.param1), param2(to, from.param2) {}
};
#pragma pack(pop)
struct GranuleMaterializeStats {
// file-level stats
int64_t inputBytes;

View File

@ -51,7 +51,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
Version beginVersion,
Version readVersion,
Optional<StringRef> snapshotData,
StringRef deltaFileData[],
const std::vector<StringRef>& deltaFileData,
GranuleMaterializeStats& stats);
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);
@ -59,4 +59,8 @@ std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, s
// For benchmark testing only. It should never be called in prod.
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange);
// just for client passthrough. reads all key-value pairs from a snapshot file, and all mutations from a delta file
RangeResult bgReadSnapshotFile(const StringRef& data);
Standalone<VectorRef<GranuleMutationRef>> bgReadDeltaFile(const StringRef& data);
#endif

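A hedged usage sketch for the new passthrough readers above: decode a delta file's raw bytes into GranuleMutationRef entries and walk them on the client. loadFileBytes is a hypothetical stand-in for however the caller obtained the file contents.

// Hypothetical helper; any source of the raw delta file bytes will do.
Standalone<StringRef> deltaBytes = loadFileBytes("delta.1234");

Standalone<VectorRef<GranuleMutationRef>> deltas = bgReadDeltaFile(deltaBytes);
for (const GranuleMutationRef& m : deltas) {
	if (m.type == MutationRef::SetValue) {
		// apply m.param1 -> m.param2 at m.version
	} else if (m.type == MutationRef::ClearRange) {
		// clear the range [m.param1, m.param2) at m.version
	}
}
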
View File

@ -0,0 +1,187 @@
/*
* BlobGranuleRequest.actor.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
// When actually compiled (NO_INTELLISENSE), include the generated version of this file. In intellisense use the source
// version.
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_G_H)
#define FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_G_H
#include "fdbclient/BlobGranuleRequest.actor.g.h"
#elif !defined(FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_H)
#define FDBCLIENT_BLOB_GRANULE_REQUEST_ACTOR_H
#include "flow/flow.h"
#include "flow/Knobs.h"
// #include "fdbclient/NativeAPI.actor.h"
#include "flow/Arena.h"
#include "fdbclient/DatabaseContext.h"
#include "fdbclient/BlobWorkerInterface.h"
#include "flow/actorcompiler.h" // This must be the last #include.
#define BGR_DEBUG false
ACTOR template <class Request, bool P>
Future<Standalone<VectorRef<REPLY_TYPE(Request)>>> txnDoBlobGranuleRequests(
Transaction* tr,
Key* beginKey,
Key endKey,
Request request,
RequestStream<Request, P> BlobWorkerInterface::*channel) {
// TODO KNOB
state RangeResult blobGranuleMapping = wait(krmGetRanges(
tr, blobGranuleMappingKeys.begin, KeyRangeRef(*beginKey, endKey), 64, GetRangeLimits::BYTE_LIMIT_UNLIMITED));
state int i = 0;
state std::vector<Future<ErrorOr<REPLY_TYPE(Request)>>> requests;
state Standalone<VectorRef<REPLY_TYPE(Request)>> results;
for (; i < blobGranuleMapping.size() - 1; i++) {
if (!blobGranuleMapping[i].value.size()) {
if (BGR_DEBUG) {
fmt::print("ERROR: No valid granule data for range [{0} - {1}) \n",
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable());
}
// no granule for range
throw blob_granule_transaction_too_old();
}
state UID workerId = decodeBlobGranuleMappingValue(blobGranuleMapping[i].value);
if (workerId == UID()) {
if (BGR_DEBUG) {
fmt::print("ERROR: Invalid Blob Worker ID for range [{0} - {1}) \n",
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable());
}
// no worker for granule
throw blob_granule_transaction_too_old();
}
if (!tr->trState->cx->blobWorker_interf.count(workerId)) {
Optional<Value> workerInterface = wait(tr->get(blobWorkerListKeyFor(workerId)));
// from the time the mapping was read from the db, the associated blob worker
// could have died and so its interface wouldn't be present as part of the blobWorkerList
// we persist in the db.
if (workerInterface.present()) {
tr->trState->cx->blobWorker_interf[workerId] = decodeBlobWorkerListValue(workerInterface.get());
} else {
if (BGR_DEBUG) {
fmt::print("ERROR: Worker for range [{1} - {2}) does not exist!\n",
workerId.toString().substr(0, 5),
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable());
}
// throw to force read version to increase and to retry reading mapping
throw transaction_too_old();
}
}
if (BGR_DEBUG) {
fmt::print("Requesting range [{0} - {1}) from worker {2}!\n",
blobGranuleMapping[i].key.printable(),
blobGranuleMapping[i + 1].key.printable(),
workerId.toString().substr(0, 5));
}
KeyRangeRef range(blobGranuleMapping[i].key, blobGranuleMapping[i + 1].key);
request.reply.reset();
request.setRange(range);
// TODO consolidate?
BlobWorkerInterface bwi = tr->trState->cx->blobWorker_interf[workerId];
RequestStream<Request, P> const* stream = &(bwi.*channel);
Future<ErrorOr<REPLY_TYPE(Request)>> response = stream->tryGetReply(request);
requests.push_back(response);
}
// wait for each request. If it has an error, retry from there if it is a retriable error
state int j = 0;
for (; j < requests.size(); j++) {
try {
ErrorOr<REPLY_TYPE(Request)> result = wait(requests[j]);
if (result.isError()) {
throw result.getError();
}
results.push_back(results.arena(), result.get());
} catch (Error& e) {
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_request_maybe_delivered ||
e.code() == error_code_broken_promise || e.code() == error_code_connection_failed) {
// re-read mapping and retry from failed req
i = j;
break;
} else {
if (BGR_DEBUG) {
fmt::print("ERROR: Error doing request for range [{0} - {1}): {2}!\n",
blobGranuleMapping[j].key.printable(),
blobGranuleMapping[j + 1].key.printable(),
e.name());
}
throw;
}
}
}
if (i < blobGranuleMapping.size() - 1) {
// a request failed, retry from there after a sleep
*beginKey = blobGranuleMapping[i].key;
wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY));
} else if (blobGranuleMapping.more) {
*beginKey = blobGranuleMapping.back().key;
// no requests failed but there is more to read, continue reading
} else {
*beginKey = endKey;
}
return results;
}
// FIXME: port other request types to this function
ACTOR template <class Request, bool P>
Future<Standalone<VectorRef<REPLY_TYPE(Request)>>> doBlobGranuleRequests(
Database cx,
KeyRange range,
Request request,
RequestStream<Request, P> BlobWorkerInterface::*channel) {
state Key beginKey = range.begin;
state Key endKey = range.end;
state Transaction tr(cx);
state Standalone<VectorRef<REPLY_TYPE(Request)>> results;
loop {
if (beginKey >= endKey) {
return results;
}
try {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
Standalone<VectorRef<REPLY_TYPE(Request)>> partialResults =
wait(txnDoBlobGranuleRequests(&tr, &beginKey, endKey, request, channel));
if (!partialResults.empty()) {
results.arena().dependsOn(partialResults.arena());
results.append(results.arena(), partialResults.begin(), partialResults.size());
}
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
#include "flow/unactorcompiler.h"
#endif

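A hedged sketch of how a caller might use the new fan-out helper, pairing it with the FlushGranuleRequest shown later in this diff; the flushGranuleRequest stream member name on BlobWorkerInterface is assumed here for illustration.

ACTOR Future<Void> exampleFlushRange(Database cx, KeyRange range, int64_t managerEpoch, Version flushVersion) {
	// One request object is reused; doBlobGranuleRequests resets the reply and
	// narrows the range per granule before sending it to each blob worker.
	FlushGranuleRequest req(managerEpoch, range, flushVersion, /*compactAfter=*/true);
	wait(success(doBlobGranuleRequests(cx, range, req, &BlobWorkerInterface::flushGranuleRequest)));
	return Void();
}
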
View File

@ -46,10 +46,12 @@ struct BlobWorkerStats {
Counter fullRejections;
Counter forceFlushCleanups;
Counter readDrivenCompactions;
Counter oldFeedSnapshots;
int numRangesAssigned;
int mutationBytesBuffered;
int activeReadRequests;
// TODO: add gauge for granules blocking on old snapshots, once this gauge is fixed
int granulesPendingSplitCheck;
Version minimumCFVersion;
Version cfVersionLag;
@ -63,8 +65,8 @@ struct BlobWorkerStats {
LatencySample readLatencySample;
Reference<FlowLock> initialSnapshotLock;
Reference<FlowLock> resnapshotLock;
Reference<FlowLock> deltaWritesLock;
Reference<FlowLock> resnapshotBudget;
Reference<FlowLock> deltaWritesBudget;
Future<Void> logger;
@ -72,8 +74,8 @@ struct BlobWorkerStats {
explicit BlobWorkerStats(UID id,
double interval,
Reference<FlowLock> initialSnapshotLock,
Reference<FlowLock> resnapshotLock,
Reference<FlowLock> deltaWritesLock,
Reference<FlowLock> resnapshotBudget,
Reference<FlowLock> deltaWritesBudget,
double sampleLoggingInterval,
double fileOpLatencySketchAccuracy,
double requestLatencySketchAccuracy)
@ -93,17 +95,17 @@ struct BlobWorkerStats {
flushGranuleReqs("FlushGranuleReqs", cc), compressionBytesRaw("CompressionBytesRaw", cc),
compressionBytesFinal("CompressionBytesFinal", cc), fullRejections("FullRejections", cc),
forceFlushCleanups("ForceFlushCleanups", cc), readDrivenCompactions("ReadDrivenCompactions", cc),
numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0), granulesPendingSplitCheck(0),
minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0), lastResidentMemory(0),
snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics",
id,
sampleLoggingInterval,
fileOpLatencySketchAccuracy),
oldFeedSnapshots("OldFeedSnapshots", cc), numRangesAssigned(0), mutationBytesBuffered(0), activeReadRequests(0),
granulesPendingSplitCheck(0), minimumCFVersion(0), cfVersionLag(0), notAtLatestChangeFeeds(0),
lastResidentMemory(0), snapshotBlobWriteLatencySample("SnapshotBlobWriteMetrics",
id,
sampleLoggingInterval,
fileOpLatencySketchAccuracy),
deltaBlobWriteLatencySample("DeltaBlobWriteMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy),
reSnapshotLatencySample("GranuleResnapshotMetrics", id, sampleLoggingInterval, fileOpLatencySketchAccuracy),
readLatencySample("GranuleReadLatencyMetrics", id, sampleLoggingInterval, requestLatencySketchAccuracy),
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotLock(resnapshotLock),
deltaWritesLock(deltaWritesLock) {
estimatedMaxResidentMemory(0), initialSnapshotLock(initialSnapshotLock), resnapshotBudget(resnapshotBudget),
deltaWritesBudget(deltaWritesBudget) {
specialCounter(cc, "NumRangesAssigned", [this]() { return this->numRangesAssigned; });
specialCounter(cc, "MutationBytesBuffered", [this]() { return this->mutationBytesBuffered; });
specialCounter(cc, "ActiveReadRequests", [this]() { return this->activeReadRequests; });
@ -115,10 +117,10 @@ struct BlobWorkerStats {
specialCounter(cc, "EstimatedMaxResidentMemory", [this]() { return this->estimatedMaxResidentMemory; });
specialCounter(cc, "InitialSnapshotsActive", [this]() { return this->initialSnapshotLock->activePermits(); });
specialCounter(cc, "InitialSnapshotsWaiting", [this]() { return this->initialSnapshotLock->waiters(); });
specialCounter(cc, "ReSnapshotsActive", [this]() { return this->resnapshotLock->activePermits(); });
specialCounter(cc, "ReSnapshotsWaiting", [this]() { return this->resnapshotLock->waiters(); });
specialCounter(cc, "DeltaFileWritesActive", [this]() { return this->deltaWritesLock->activePermits(); });
specialCounter(cc, "DeltaFileWritesWaiting", [this]() { return this->deltaWritesLock->waiters(); });
specialCounter(cc, "ReSnapshotBytesActive", [this]() { return this->resnapshotBudget->activePermits(); });
specialCounter(cc, "ReSnapshotBytesWaiting", [this]() { return this->resnapshotBudget->waiters(); });
specialCounter(cc, "DeltaFileWriteBytesActive", [this]() { return this->deltaWritesBudget->activePermits(); });
specialCounter(cc, "DeltaFileWriteBytesWaiting", [this]() { return this->deltaWritesBudget->waiters(); });
logger = cc.traceCounters("BlobWorkerMetrics", id, interval, "BlobWorkerMetrics");
}

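The renamed members reflect a switch from counting concurrent operations to budgeting bytes: each FlowLock now holds a byte budget, and a file write takes permits equal to its size. A sketch of that pattern, not the blob worker's exact call sites:

ACTOR Future<Void> exampleWriteWithBudget(Reference<FlowLock> deltaWritesBudget, int64_t bytesToWrite) {
	// Block until `bytesToWrite` permits are available, then hold them for the
	// duration of the write; the Releaser returns them even on error paths.
	wait(deltaWritesBudget->take(TaskPriority::DefaultYield, bytesToWrite));
	state FlowLock::Releaser releaser(*deltaWritesBudget, bytesToWrite);
	// ... perform the delta file write ...
	return Void();
}
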
View File

@ -332,15 +332,19 @@ struct FlushGranuleRequest {
int64_t managerEpoch;
KeyRange granuleRange;
Version flushVersion;
bool compactAfter;
ReplyPromise<Void> reply;
FlushGranuleRequest() : managerEpoch(-1), flushVersion(invalidVersion) {}
explicit FlushGranuleRequest(int64_t managerEpoch, KeyRange granuleRange, Version flushVersion)
: managerEpoch(managerEpoch), granuleRange(granuleRange), flushVersion(flushVersion) {}
FlushGranuleRequest() : managerEpoch(-1), flushVersion(invalidVersion), compactAfter(false) {}
explicit FlushGranuleRequest(int64_t managerEpoch, KeyRange granuleRange, Version flushVersion, bool compactAfter)
: managerEpoch(managerEpoch), granuleRange(granuleRange), flushVersion(flushVersion), compactAfter(compactAfter) {
}
void setRange(const KeyRangeRef& range) { granuleRange = range; }
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, managerEpoch, granuleRange, flushVersion, reply);
serializer(ar, managerEpoch, granuleRange, flushVersion, compactAfter, reply);
}
};

View File

@ -239,7 +239,11 @@ public:
int BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
int BLOBSTORE_MAX_SEND_BYTES_PER_SECOND;
int BLOBSTORE_MAX_RECV_BYTES_PER_SECOND;
int BGR_READ_BLOCK_SIZE;
bool BLOBSTORE_GLOBAL_CONNECTION_POOL;
bool BLOBSTORE_ENABLE_LOGGING;
double BLOBSTORE_STATS_LOGGING_INTERVAL;
double BLOBSTORE_LATENCY_LOGGING_INTERVAL;
double BLOBSTORE_LATENCY_LOGGING_ACCURACY;
int CONSISTENCY_CHECK_RATE_LIMIT_MAX;
int CONSISTENCY_CHECK_ONE_ROUND_TARGET_COMPLETION_TIME;
@ -293,6 +297,10 @@ public:
// Encryption-at-rest
bool ENABLE_ENCRYPTION_CPU_TIME_LOGGING;
// This Knob is a comma-delimited string (e.g. 0,1,2,3) that specifies which tenants the EKP should throw
// key_not_found errors for. If TenantInfo::INVALID_TENANT is contained within the list then no tenants will be
// dropped. This Knob should ONLY be used in simulation for testing purposes
std::string SIMULATION_EKP_TENANT_IDS_TO_DROP;
ClientKnobs(Randomize randomize);
void initialize(Randomize randomize);

View File

@ -512,10 +512,11 @@ struct GetStorageServerRejoinInfoReply {
Optional<Tag> newTag;
bool newLocality;
std::vector<std::pair<Version, Tag>> history;
EncryptionAtRestMode encryptMode;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, version, tag, newTag, newLocality, history);
serializer(ar, version, tag, newTag, newLocality, history, encryptMode);
}
};

View File

@ -313,6 +313,11 @@ struct CommitTransactionRef {
bool lock_aware = false; // set when metadata mutations are present
Optional<SpanContext> spanContext;
// set by Commit Proxy
// The tenants associated with this transaction. This field only exists
// when tenant mode is required and this transaction has metadata mutations.
Optional<VectorRef<int64_t>> tenantIds;
template <class Ar>
force_inline void serialize(Ar& ar) {
if constexpr (is_fb_function<Ar>) {
@ -323,7 +328,8 @@ struct CommitTransactionRef {
read_snapshot,
report_conflicting_keys,
lock_aware,
spanContext);
spanContext,
tenantIds);
} else {
serializer(ar, read_conflict_ranges, write_conflict_ranges, mutations, read_snapshot);
if (ar.protocolVersion().hasReportConflictingKeys()) {

View File

@ -1520,6 +1520,8 @@ struct EncryptionAtRestMode {
bool operator==(const EncryptionAtRestMode& e) const { return isEquals(e); }
bool operator!=(const EncryptionAtRestMode& e) const { return !isEquals(e); }
bool operator==(Mode m) const { return mode == m; }
bool operator!=(Mode m) const { return mode != m; }
bool isEncryptionEnabled() const { return mode != EncryptionAtRestMode::DISABLED; }
@ -1548,6 +1550,11 @@ struct EncryptionAtRestMode {
uint32_t mode;
};
template <>
struct Traceable<EncryptionAtRestMode> : std::true_type {
static std::string toString(const EncryptionAtRestMode& mode) { return mode.toString(); }
};
typedef StringRef ClusterNameRef;
typedef Standalone<ClusterNameRef> ClusterName;

View File

@ -19,6 +19,7 @@
*/
#pragma once
#include "flow/EncryptUtils.h"
#include "flow/genericactors.actor.h"
#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H)
#define FDBCLIENT_GETCIPHERKEYS_ACTOR_G_H
#include "fdbclient/GetEncryptCipherKeys.actor.g.h"
@ -27,7 +28,9 @@
#include "fdbclient/BlobCipher.h"
#include "fdbclient/EncryptKeyProxyInterface.h"
#include "fdbclient/Knobs.h"
#include "fdbrpc/Stats.h"
#include "fdbrpc/TenantInfo.h"
#include "flow/Knobs.h"
#include "flow/IRandom.h"
@ -182,6 +185,18 @@ Future<EKPGetBaseCipherKeysByIdsReply> getUncachedEncryptCipherKeys(Reference<As
TraceEvent(SevWarn, "GetEncryptCipherKeys_RequestFailed").error(reply.error.get());
throw encrypt_keys_fetch_failed();
}
if (g_network && g_network->isSimulated() && usageType == BlobCipherMetrics::RESTORE) {
std::unordered_set<int64_t> tenantIdsToDrop =
parseStringToUnorderedSet<int64_t>(CLIENT_KNOBS->SIMULATION_EKP_TENANT_IDS_TO_DROP, ',');
if (!tenantIdsToDrop.count(TenantInfo::INVALID_TENANT)) {
for (auto& baseCipherInfo : request.baseCipherInfos) {
if (tenantIdsToDrop.count(baseCipherInfo.domainId)) {
TraceEvent("GetEncryptCipherKeys_SimulatedError").detail("DomainId", baseCipherInfo.domainId);
throw encrypt_keys_fetch_failed();
}
}
}
}
return reply;
} catch (Error& e) {
TraceEvent("GetEncryptCipherKeys_CaughtError").error(e);

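For context, SIMULATION_EKP_TENANT_IDS_TO_DROP is just a comma-delimited list such as "0,1,2,3". An illustrative re-implementation of the parse the check above relies on (the project itself uses parseStringToUnorderedSet from flow):

#include <cstdint>
#include <sstream>
#include <string>
#include <unordered_set>

std::unordered_set<int64_t> exampleParseIdList(const std::string& s) {
	std::unordered_set<int64_t> ids;
	std::stringstream ss(s);
	std::string token;
	while (std::getline(ss, token, ',')) {
		if (!token.empty())
			ids.insert(std::stoll(token)); // each entry is a tenant/domain ID
	}
	return ids;
}
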
View File

@ -604,7 +604,7 @@ struct RegisterClusterImpl {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
state Future<std::vector<std::pair<TenantName, TenantMapEntry>>> existingTenantsFuture =
state Future<std::vector<std::pair<TenantName, int64_t>>> existingTenantsFuture =
TenantAPI::listTenantsTransaction(tr, ""_sr, "\xff\xff"_sr, 1);
state ThreadFuture<RangeResult> existingDataFuture = tr->getRange(normalKeys, 1);
@ -624,7 +624,7 @@ struct RegisterClusterImpl {
}
// Check for any existing data
std::vector<std::pair<TenantName, TenantMapEntry>> existingTenants =
std::vector<std::pair<TenantName, int64_t>> existingTenants =
wait(safeThreadFutureToFuture(existingTenantsFuture));
if (!existingTenants.empty()) {
TraceEvent(SevWarn, "CannotRegisterClusterWithTenants").detail("ClusterName", self->clusterName);
@ -1544,34 +1544,72 @@ Future<Void> deleteTenant(Reference<DB> db, int64_t id) {
return Void();
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantsTransaction(Transaction tr,
TenantNameRef begin,
TenantNameRef end,
int limit) {
template <class Transaction>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenantsTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit,
int offset = 0) {
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
auto future = ManagementClusterMetadata::tenantMetadata().tenantNameIndex.getRange(tr, begin, end, limit + offset);
return fmap(
[offset](auto f) {
std::vector<std::pair<TenantName, int64_t>>& results = f.results;
results.erase(results.begin(), results.begin() + offset);
return results;
},
future);
}
state KeyBackedRangeResult<std::pair<TenantName, int64_t>> matchingTenants =
wait(ManagementClusterMetadata::tenantMetadata().tenantNameIndex.getRange(tr, begin, end, limit));
template <class DB>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenants(Reference<DB> db,
TenantName begin,
TenantName end,
int limit,
int offset = 0) {
return runTransaction(db, [=](Reference<typename DB::TransactionT> tr) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return listTenantsTransaction(tr, begin, end, limit, offset);
});
}
state std::vector<Future<TenantMapEntry>> tenantEntryFutures;
for (auto const& [name, id] : matchingTenants.results) {
tenantEntryFutures.push_back(getTenantTransaction(tr, id));
// Scan the tenant index to get a list of tenant IDs, and then look up the metadata for each ID individually
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadataTransaction(
Transaction tr,
std::vector<std::pair<TenantName, int64_t>> tenantIds) {
state int idIdx = 0;
state std::vector<Future<Optional<TenantMapEntry>>> futures;
for (; idIdx < tenantIds.size(); ++idIdx) {
futures.push_back(MetaclusterAPI::tryGetTenantTransaction(tr, tenantIds[idIdx].second));
}
wait(waitForAll(tenantEntryFutures));
wait(waitForAll(futures));
std::vector<std::pair<TenantName, TenantMapEntry>> results;
for (int i = 0; i < matchingTenants.results.size(); ++i) {
// Tenants being renamed will show up twice; once under each name
results.emplace_back(matchingTenants.results[i].first, tenantEntryFutures[i].get());
results.reserve(futures.size());
for (int i = 0; i < futures.size(); ++i) {
const TenantMapEntry& entry = futures[i].get().get();
results.emplace_back(entry.tenantName, entry);
}
return results;
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadataTransaction(Transaction tr,
TenantNameRef begin,
TenantNameRef end,
int limit) {
std::vector<std::pair<TenantName, int64_t>> matchingTenants = wait(listTenantsTransaction(tr, begin, end, limit));
std::vector<std::pair<TenantName, TenantMapEntry>> results =
wait(listTenantMetadataTransaction(tr, matchingTenants));
return results;
}
ACTOR template <class DB>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenants(
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadata(
Reference<DB> db,
TenantName begin,
TenantName end,
@ -1586,30 +1624,24 @@ Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenants(
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
if (filters.empty()) {
wait(store(results, listTenantsTransaction(tr, begin, end, limit + offset)));
if (offset >= results.size()) {
results.clear();
} else if (offset > 0) {
results.erase(results.begin(), results.begin() + offset);
}
std::vector<std::pair<TenantName, int64_t>> ids =
wait(MetaclusterAPI::listTenantsTransaction(tr, begin, end, limit, offset));
wait(store(results, MetaclusterAPI::listTenantMetadataTransaction(tr, ids)));
return results;
}
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
// read in batch
state int count = 0;
loop {
std::vector<std::pair<TenantName, TenantMapEntry>> tenantBatch =
wait(listTenantsTransaction(tr, begin, end, std::max(limit + offset, 1000)));
wait(MetaclusterAPI::listTenantMetadataTransaction(tr, begin, end, std::max(limit + offset, 1000)));
if (tenantBatch.empty()) {
return results;
}
for (auto const& [name, entry] : tenantBatch) {
if (filters.empty() || std::count(filters.begin(), filters.end(), entry.tenantState)) {
if (std::count(filters.begin(), filters.end(), entry.tenantState)) {
++count;
if (count > offset) {
results.push_back(std::make_pair(name, entry));

View File

@ -71,6 +71,7 @@ struct NetworkOptions {
std::string traceClockSource;
std::string traceFileIdentifier;
std::string tracePartialFileSuffix;
bool traceInitializeOnSetup;
Optional<bool> logClientInfo;
Reference<ReferencedObject<Standalone<VectorRef<ClientVersionRef>>>> supportedVersions;
bool runLoopProfilingEnabled;
@ -247,6 +248,7 @@ public:
Future<Void> ready() const { return success(idFuture); }
int64_t id() const;
Future<int64_t> getIdFuture() const;
KeyRef prefix() const;
std::string description() const;

View File

@ -21,15 +21,53 @@
#pragma once
#include <map>
#include <unordered_map>
#include <functional>
#include "flow/IRandom.h"
#include "flow/flow.h"
#include "flow/Net2Packet.h"
#include "fdbclient/Knobs.h"
#include "flow/IRateControl.h"
#include "fdbrpc/HTTP.h"
#include "fdbrpc/Stats.h"
#include "fdbclient/JSONDoc.h"
#include "flow/IConnection.h"
#include <boost/functional/hash.hpp>
// unique key that identifies interchangeable connections for the same settings and destination
// FIXME: can we define std::hash as a struct member of a S3BlobStoreEndpoint?
struct BlobStoreConnectionPoolKey {
std::string host;
std::string service;
std::string region;
bool isTLS;
BlobStoreConnectionPoolKey(const std::string& host,
const std::string& service,
const std::string& region,
bool isTLS)
: host(host), service(service), region(region), isTLS(isTLS) {}
bool operator==(const BlobStoreConnectionPoolKey& other) const {
return isTLS == other.isTLS && host == other.host && service == other.service && region == other.region;
}
};
namespace std {
template <>
struct hash<BlobStoreConnectionPoolKey> {
std::size_t operator()(const BlobStoreConnectionPoolKey& key) const {
std::size_t seed = 0;
boost::hash_combine(seed, std::hash<std::string>{}(key.host));
boost::hash_combine(seed, std::hash<std::string>{}(key.service));
boost::hash_combine(seed, std::hash<std::string>{}(key.region));
boost::hash_combine(seed, std::hash<bool>{}(key.isTLS));
return seed;
}
};
} // namespace std
// Representation of all the things you need to connect to a blob store instance with some credentials.
// Reference counted because a very large number of them could be needed.
class S3BlobStoreEndpoint : public ReferenceCounted<S3BlobStoreEndpoint> {
@ -47,6 +85,54 @@ public:
static Stats s_stats;
struct BlobStats {
UID id;
CounterCollection cc;
Counter requestsSuccessful;
Counter requestsFailed;
Counter newConnections;
Counter expiredConnections;
Counter reusedConnections;
Counter fastRetries;
LatencySample requestLatency;
// init not in static codepath, to avoid initialization race issues and so no blob connections means no
// unnecessary blob stats traces
BlobStats()
: id(deterministicRandom()->randomUniqueID()), cc("BlobStoreStats", id.toString()),
requestsSuccessful("RequestsSuccessful", cc), requestsFailed("RequestsFailed", cc),
newConnections("NewConnections", cc), expiredConnections("ExpiredConnections", cc),
reusedConnections("ReusedConnections", cc), fastRetries("FastRetries", cc),
requestLatency("BlobStoreRequestLatency",
id,
CLIENT_KNOBS->BLOBSTORE_LATENCY_LOGGING_INTERVAL,
CLIENT_KNOBS->BLOBSTORE_LATENCY_LOGGING_ACCURACY) {}
};
// null when initialized, so no blob stats until a blob connection is used
static std::unique_ptr<BlobStats> blobStats;
static Future<Void> statsLogger;
void maybeStartStatsLogger() {
if (!blobStats && CLIENT_KNOBS->BLOBSTORE_ENABLE_LOGGING) {
blobStats = std::make_unique<BlobStats>();
specialCounter(
blobStats->cc, "GlobalConnectionPoolCount", [this]() { return this->globalConnectionPool.size(); });
specialCounter(blobStats->cc, "GlobalConnectionPoolSize", [this]() {
// FIXME: could track this explicitly via an int variable with extra logic, but this should be small and
// infrequent
int totalConnections = 0;
for (auto& it : this->globalConnectionPool) {
totalConnections += it.second->pool.size();
}
return totalConnections;
});
statsLogger = blobStats->cc.traceCounters(
"BlobStoreMetrics", blobStats->id, CLIENT_KNOBS->BLOBSTORE_STATS_LOGGING_INTERVAL, "BlobStoreMetrics");
}
}
struct Credentials {
std::string key;
std::string secret;
@ -60,7 +146,7 @@ public:
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
enable_read_cache, read_block_size, read_ahead_blocks, read_cache_blocks_per_file,
max_send_bytes_per_second, max_recv_bytes_per_second, sdk_auth;
max_send_bytes_per_second, max_recv_bytes_per_second, sdk_auth, global_connection_pool;
bool set(StringRef name, int value);
std::string getURLParameters() const;
static std::vector<std::string> getKnobDescriptions() {
@ -95,11 +181,27 @@ public:
"max_recv_bytes_per_second (or rbps) Max receive bytes per second for all requests combined (NOT YET "
"USED).",
"sdk_auth (or sa) Use AWS SDK to resolve credentials. Only valid if "
"BUILD_AWS_BACKUP is enabled."
"BUILD_AWS_BACKUP is enabled.",
"global_connection_pool (or gcp) Enable shared connection pool between all blobstore instances."
};
}
bool isTLS() const { return secure_connection == 1; }
};
struct ReusableConnection {
Reference<IConnection> conn;
double expirationTime;
};
// basically a reference-counted queue, with the option to add other fields
struct ConnectionPoolData : NonCopyable, ReferenceCounted<ConnectionPoolData> {
std::queue<ReusableConnection> pool;
};
// global connection pool for multiple blobstore endpoints with same connection settings and request destination
static std::unordered_map<BlobStoreConnectionPoolKey, Reference<ConnectionPoolData>> globalConnectionPool;
S3BlobStoreEndpoint(std::string const& host,
std::string const& service,
std::string region,
@ -123,15 +225,34 @@ public:
if (host.empty() || (proxyHost.present() != proxyPort.present()))
throw connection_string_invalid();
// set connection pool instance
if (useProxy || !knobs.global_connection_pool) {
// don't use global connection pool if there's a proxy, as it complicates the logic
// FIXME: handle proxies?
connectionPool = makeReference<ConnectionPoolData>();
} else {
BlobStoreConnectionPoolKey key(host, service, region, knobs.isTLS());
auto it = globalConnectionPool.find(key);
if (it != globalConnectionPool.end()) {
connectionPool = it->second;
} else {
connectionPool = makeReference<ConnectionPoolData>();
globalConnectionPool.insert({ key, connectionPool });
}
}
ASSERT(connectionPool.isValid());
maybeStartStatsLogger();
}
static std::string getURLFormat(bool withResource = false) {
const char* resource = "";
if (withResource)
resource = "<name>";
return format(
"blobstore://<api_key>:<secret>:<security_token>@<host>[:<port>]/%s[?<param>=<value>[&<param>=<value>]...]",
resource);
return format("blobstore://<api_key>:<secret>:<security_token>@<host>[:<port>]/"
"%s[?<param>=<value>[&<param>=<value>]...]",
resource);
}
typedef std::map<std::string, std::string> ParametersT;
@ -149,11 +270,9 @@ public:
// parameters in addition to the passed params string
std::string getResourceURL(std::string resource, std::string params) const;
struct ReusableConnection {
Reference<IConnection> conn;
double expirationTime;
};
std::queue<ReusableConnection> connectionPool;
// FIXME: add periodic connection reaper to pool
// local connection pool for this blobstore
Reference<ConnectionPoolData> connectionPool;
Future<ReusableConnection> connect(bool* reusingConn);
void returnConnection(ReusableConnection& conn);

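A hedged sketch of what the shared pool buys: two endpoints constructed with identical host, service, region, and TLS settings hash to the same BlobStoreConnectionPoolKey, so they resolve to the same ConnectionPoolData and can reuse each other's idle connections.

BlobStoreConnectionPoolKey a("s3.us-west-2.amazonaws.com", "443", "us-west-2", true);
BlobStoreConnectionPoolKey b("s3.us-west-2.amazonaws.com", "443", "us-west-2", true);
// Same key and same hash, so both endpoints share one pooled connection queue.
ASSERT(a == b);
ASSERT(std::hash<BlobStoreConnectionPoolKey>{}(a) == std::hash<BlobStoreConnectionPoolKey>{}(b));
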
View File

@ -791,6 +791,7 @@ public:
std::string STORAGESERVER_READ_PRIORITIES;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READTYPE_PRIORITY_MAP;
int SPLIT_METRICS_MAX_ROWS;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -937,7 +938,6 @@ public:
double REDWOOD_HISTOGRAM_INTERVAL;
bool REDWOOD_EVICT_UPDATED_PAGES; // Whether to prioritize eviction of updated pages from cache.
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_IO_PRIORITIES;
@ -990,11 +990,18 @@ public:
int BG_RDC_BYTES_FACTOR;
int BG_RDC_READ_FACTOR;
bool BG_WRITE_MULTIPART;
bool BG_ENABLE_DYNAMIC_WRITE_AMP;
double BG_DYNAMIC_WRITE_AMP_MIN_FACTOR;
double BG_DYNAMIC_WRITE_AMP_DECREASE_FACTOR;
int BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM;
int BLOB_WORKER_RESNAPSHOT_PARALLELISM;
int BLOB_WORKER_DELTA_FILE_WRITE_PARALLELISM;
int BLOB_WORKER_RDC_PARALLELISM;
// The resnapshot/delta parallelism knobs are deprecated and replaced by the budget_bytes knobs! FIXME: remove after
// next release
int64_t BLOB_WORKER_RESNAPSHOT_BUDGET_BYTES;
int64_t BLOB_WORKER_DELTA_WRITE_BUDGET_BYTES;
double BLOB_WORKER_TIMEOUT; // Blob Manager's reaction time to a blob worker failure
double BLOB_WORKER_REQUEST_TIMEOUT; // Blob Worker's server-side request timeout
@ -1017,6 +1024,7 @@ public:
int BLOB_MANIFEST_RW_ROWS;
std::string BLOB_RESTORE_MLOGS_URL;
int BLOB_MIGRATOR_ERROR_RETRIES;
std::string BLOB_RESTORE_MANIFEST_URL;
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;

View File

@ -160,9 +160,9 @@ struct DataMoveMetaData {
void setPhase(Phase phase) { this->phase = static_cast<int16_t>(phase); }
std::string toString() const {
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + " [Range]: " + describe(ranges) +
" [Phase]: " + std::to_string(static_cast<int>(phase)) +
" [Source Servers]: " + describe(src) + " [Destination Servers]: " + describe(dest);
std::string res = "DataMoveMetaData: [ID]: " + id.shortString() + ", [Range]: " + describe(ranges) +
", [Phase]: " + std::to_string(static_cast<int>(phase)) +
", [Source Servers]: " + describe(src) + ", [Destination Servers]: " + describe(dest);
return res;
}
@ -172,4 +172,4 @@ struct DataMoveMetaData {
}
};
#endif
#endif

View File

@ -740,10 +740,11 @@ struct SplitMetricsReply {
constexpr static FileIdentifier file_identifier = 11530792;
Standalone<VectorRef<KeyRef>> splits;
StorageMetrics used;
bool more = false;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, splits, used);
serializer(ar, splits, used, more);
}
};

View File

@ -220,7 +220,7 @@ private:
if (!cache->lastTenantId.present()) {
return false;
}
return cache->lastTenantId.get() > 0;
return cache->lastTenantId.get() >= 0;
}
return true;
}

View File

@ -474,18 +474,37 @@ Future<Void> configureTenantTransaction(Transaction tr,
return Void();
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantsTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit) {
template <class Transaction>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenantsTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit) {
tr->setOption(FDBTransactionOptions::RAW_ACCESS);
auto future = TenantMetadata::tenantNameIndex().getRange(tr, begin, end, limit);
return fmap([](auto f) -> std::vector<std::pair<TenantName, int64_t>> { return f.results; }, future);
}
KeyBackedRangeResult<std::pair<TenantName, int64_t>> matchingTenants =
wait(TenantMetadata::tenantNameIndex().getRange(tr, begin, end, limit));
template <class DB>
Future<std::vector<std::pair<TenantName, int64_t>>> listTenants(Reference<DB> db,
TenantName begin,
TenantName end,
int limit) {
return runTransaction(db, [=](Reference<typename DB::TransactionT> tr) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
return listTenantsTransaction(tr, begin, end, limit);
});
}
ACTOR template <class Transaction>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadataTransaction(Transaction tr,
TenantName begin,
TenantName end,
int limit) {
std::vector<std::pair<TenantName, int64_t>> matchingTenants = wait(listTenantsTransaction(tr, begin, end, limit));
state std::vector<Future<TenantMapEntry>> tenantEntryFutures;
for (auto const& [name, id] : matchingTenants.results) {
for (auto const& [name, id] : matchingTenants) {
tenantEntryFutures.push_back(getTenantTransaction(tr, id));
}
@ -499,24 +518,16 @@ Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantsTransactio
return results;
}
ACTOR template <class DB>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenants(Reference<DB> db,
TenantName begin,
TenantName end,
int limit) {
state Reference<typename DB::TransactionT> tr = db->createTransaction();
loop {
try {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(listTenantsTransaction(tr, begin, end, limit));
return tenants;
} catch (Error& e) {
wait(safeThreadFutureToFuture(tr->onError(e)));
}
}
template <class DB>
Future<std::vector<std::pair<TenantName, TenantMapEntry>>> listTenantMetadata(Reference<DB> db,
TenantName begin,
TenantName end,
int limit) {
return runTransaction(db, [=](Reference<typename DB::TransactionT> tr) {
tr->setOption(FDBTransactionOptions::LOCK_AWARE);
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
return listTenantMetadataTransaction(tr, begin, end, limit);
});
}
ACTOR template <class Transaction>

View File
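The non-ACTOR variants above lean on flow's fmap to transform a future's value instead of writing a separate actor. A minimal sketch of that pattern with simplified types:

// Turn a Future<std::string> into a Future<int> without an ACTOR; the callback
// runs when the input future becomes ready.
Future<int> exampleLength(Future<std::string> f) {
	return fmap([](const std::string& s) { return static_cast<int>(s.size()); }, f);
}
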

@ -70,7 +70,7 @@ private:
RangeResult* results,
GetRangeLimits limitsHint) {
std::vector<std::pair<TenantName, TenantMapEntry>> tenants =
wait(TenantAPI::listTenantsTransaction(&ryw->getTransaction(), kr.begin, kr.end, limitsHint.rows));
wait(TenantAPI::listTenantMetadataTransaction(&ryw->getTransaction(), kr.begin, kr.end, limitsHint.rows));
for (auto tenant : tenants) {
std::string jsonString = tenant.second.toJson();
@ -202,7 +202,7 @@ private:
TenantName beginTenant,
TenantName endTenant,
std::map<TenantGroupName, int>* tenantGroupNetTenantDelta) {
state std::vector<std::pair<TenantName, TenantMapEntry>> tenants = wait(
state std::vector<std::pair<TenantName, int64_t>> tenants = wait(
TenantAPI::listTenantsTransaction(&ryw->getTransaction(), beginTenant, endTenant, CLIENT_KNOBS->TOO_MANY));
if (tenants.size() == CLIENT_KNOBS->TOO_MANY) {

View File

@ -351,6 +351,41 @@ void removeRoot(Reference<PTree<T>>& p, Version at) {
}
}
// changes p to point to a PTree with finger removed. p must be the root of the
// tree associated with finger.
//
// Invalidates finger.
template <class T>
void removeFinger(Reference<PTree<T>>& p, Version at, PTreeFinger<T> finger) {
ASSERT_GT(finger.size(), 0);
// Start at the end of the finger, remove, and propagate copies up along the
// search path (finger) as needed.
auto node = Reference<PTree<T>>::addRef(const_cast<PTree<T>*>(finger.back()));
auto* before = node.getPtr();
removeRoot(node, at);
for (;;) {
if (before == node.getPtr()) {
// Done propagating copies
return;
}
if (finger.size() == 1) {
// Check we passed the correct root for this finger
ASSERT(p.getPtr() == before);
// Propagate copy to root
p = node;
return;
}
finger.pop_back();
auto parent = Reference<PTree<T>>::addRef(const_cast<PTree<T>*>(finger.back()));
bool isLeftChild = parent->left(at).getPtr() == before;
bool isRightChild = parent->right(at).getPtr() == before;
ASSERT(isLeftChild || isRightChild); // Corrupt finger?
// Prepare for next iteration
before = parent.getPtr();
node = update(parent, isRightChild, node, at);
}
}
// changes p to point to a PTree with x removed
template <class T, class X>
void remove(Reference<PTree<T>>& p, Version at, const X& x) {
@ -745,9 +780,8 @@ public:
PTreeImpl::remove(roots.back().second, latestVersion, key);
}
void erase(iterator const& item) { // iterator must be in latest version!
// SOMEDAY: Optimize to use item.finger and avoid repeated search
K key = item.key();
erase(key);
ASSERT_EQ(item.at, latestVersion);
PTreeImpl::removeFinger(roots.back().second, latestVersion, item.finger);
}
void printDetail() { PTreeImpl::printTreeDetails(roots.back().second, 0); }

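A hedged sketch of the call path this change optimizes: erasing through an iterator taken at the latest version now removes via the iterator's finger instead of re-searching by key. Accessor names other than erase() are assumptions about the surrounding VersionedMap API, shown only for shape.

auto view = versionedMap.atLatest();   // assumed accessor for the latest-version view
auto it = view.find(key);              // assumed lookup returning an iterator with a finger
if (it != view.end()) {
	versionedMap.erase(it);            // reuses it.finger; no second O(log n) search
}
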
View File

@ -59,6 +59,8 @@ description is not currently required but encouraged.
description="Once provided, this string will be used to replace the port/PID in the log file names." />
<Option name="trace_share_among_client_threads" code="37"
description="Use the same base trace file name for all client threads as it did before version 7.2. The current default behavior is to use distinct trace file names for client threads by including their version and thread index." />
<Option name="trace_initialize_on_setup" code="38"
description="Initialize trace files on network setup, determine the local IP later. Otherwise tracing is initialized when opening the first database." />
<Option name="trace_partial_file_suffix" code="39"
paramType="String" paramDescription="Append this suffix to partially written log files. When a log file is complete, it is renamed to remove the suffix. No separator is added between the file and the suffix. If you want to add a file extension, you should include the separator - e.g. '.tmp' instead of 'tmp' to add the 'tmp' extension."
description="Set file suffix for partially written log files." />
@ -216,7 +218,8 @@ description is not currently required but encouraged.
<Option name="use_config_database" code="800"
description="Use configuration database." />
<Option name="test_causal_read_risky" code="900"
description="An integer between 0 and 100 (default is 0) expressing the probability that a client will verify it can't read stale data whenever it detects a recovery." />
paramType="Int" paramDescription="integer between 0 and 100 expressing the probability a client will verify it can't read stale data"
description="Enables verification of causal read risky by checking whether clients are able to read stale data when they detect a recovery, and logging an error if so." />
</Scope>
<Scope name="TransactionOption">

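A hedged sketch of enabling the new network option through the C API; the generated constant name follows the usual option-to-enum convention and is assumed here, as is API version 720.

#define FDB_API_VERSION 720
#include <foundationdb/fdb_c.h>

void exampleEnableEarlyTraceInit() {
	// Point tracing at a directory, then ask for trace files to be initialized at
	// network setup rather than when the first database is opened.
	fdb_network_set_option(FDB_NET_OPTION_TRACE_ENABLE, (const uint8_t*)"/var/log/foundationdb", 21);
	fdb_network_set_option(FDB_NET_OPTION_TRACE_INITIALIZE_ON_SETUP, nullptr, 0);
}
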
View File

@ -22,7 +22,9 @@
// Define boost::asio::io_service
#include <algorithm>
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include <boost/asio.hpp>

View File

@ -18,7 +18,9 @@
* limitations under the License.
*/
#ifndef BOOST_SYSTEM_NO_LIB
#define BOOST_SYSTEM_NO_LIB
#endif
#define BOOST_DATE_TIME_NO_LIB
#define BOOST_REGEX_NO_LIB
#include <boost/asio.hpp>

View File

@ -127,10 +127,23 @@ public:
int rlen = readEnd - readStart;
memcpy((uint8_t*)data + wpos, block->data + readStart, rlen);
wpos += rlen;
// unpin this block
localCache.erase(blockNum);
if (f->m_blocks.size() > f->m_cache_block_limit) {
// make an attempt to free no-longer needed blocks as we go
// FIXME: could also expire previous blocks if above limit and they're also free
auto i = f->m_blocks.find(blockNum);
ASSERT(i != f->m_blocks.end() && i->first == blockNum);
if (i->second.getFutureReferenceCount() == 1) {
// printf("evicting block %d\n", i->first);
i = f->m_blocks.erase(i);
}
}
}
ASSERT(wpos == length);
localCache.clear();
ASSERT(localCache.empty());
// If the cache is too large then go through the cache in block number order and remove any entries whose future
// has a reference count of 1, stopping once the cache is no longer too big. There is no point in removing

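The truncated comment above describes a second, size-bounded pass over the whole cache. A sketch of what that pass looks like under the same assumptions as the code above (m_blocks is an ordered map from block number to cached-read futures):

// Walk blocks in ascending block-number order, dropping entries nobody else holds,
// until the cache fits under the limit again.
for (auto i = f->m_blocks.begin(); i != f->m_blocks.end() && f->m_blocks.size() > f->m_cache_block_limit;) {
	if (i->second.getFutureReferenceCount() == 1) {
		i = f->m_blocks.erase(i);  // only this cache references the block; safe to evict
	} else {
		++i;                       // an in-flight read still holds it; keep
	}
}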