Merge branch 'main' of github.com:apple/foundationdb into tenant-list-filter

Jon Fu 2022-11-14 12:30:28 -08:00
commit 25e1721e75
64 changed files with 1649 additions and 796 deletions

View File

@ -70,10 +70,13 @@ void ApiWorkload::start() {
schedule([this]() {
// 1. Clear data
clearData([this]() {
// 2. Populate initial data
populateData([this]() {
// 3. Generate random workload
runTests();
// 2. Workload setup
setup([this]() {
// 3. Populate initial data
populateData([this]() {
// 4. Generate random workload
runTests();
});
});
});
});
@ -249,6 +252,10 @@ void ApiWorkload::populateData(TTaskFct cont) {
}
}
// Default setup is a no-op: it simply schedules the continuation.
// Workloads that need a setup phase (e.g. blob granules) override this.
void ApiWorkload::setup(TTaskFct cont) {
schedule(cont);
}
void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional<int> tenantId) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto kvPairs = std::make_shared<std::vector<fdb::KeyValue>>();
@ -322,4 +329,85 @@ std::optional<fdb::BytesRef> ApiWorkload::getTenant(std::optional<int> tenantId)
}
}
std::string ApiWorkload::debugTenantStr(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format("(tenant {0})", tenantId.value()) : "()";
}
// BlobGranule setup.
// This blobbifies ['\x00', '\xff') per tenant or for the whole database if there are no tenants.
void ApiWorkload::setupBlobGranules(TTaskFct cont) {
// This count is used to synchronize the # of tenant blobbifyRange() calls to ensure
// we only start the workload once blobbification has fully finished.
auto blobbifiedCount = std::make_shared<std::atomic<int>>(1);
if (tenants.empty()) {
blobbifiedCount->store(1);
blobbifyTenant({}, blobbifiedCount, cont);
} else {
blobbifiedCount->store(tenants.size());
for (int i = 0; i < tenants.size(); i++) {
schedule([=]() { blobbifyTenant(i, blobbifiedCount, cont); });
}
}
}
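The fan-out above coordinates its per-tenant work through a shared atomic counter: each tenant's verify path decrements it, and whichever decrement observes the previous value 1 knows it was the last and schedules the continuation. A minimal standalone sketch of that countdown idiom, using plain threads instead of the tester's scheduler (all names here are illustrative, not part of the tester):

#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>

// Run task(i) for i in [0, n) and call cont() exactly once, after the last task finishes.
void forEachThenContinue(int n, std::function<void(int)> task, std::function<void()> cont) {
    auto remaining = std::make_shared<std::atomic<int>>(n);
    std::vector<std::thread> workers;
    for (int i = 0; i < n; i++) {
        workers.emplace_back([=]() {
            task(i);
            if (remaining->fetch_sub(1) == 1) // fetch_sub returns the previous value
                cont(); // the last finisher runs the continuation
        });
    }
    for (auto& w : workers)
        w.join();
}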
void ApiWorkload::blobbifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retBlobbifyRange = std::make_shared<bool>(false);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: blobbifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->blobbifyRange(begin, end).eraseType();
ctx->continueAfter(f, [ctx, retBlobbifyRange, f]() {
*retBlobbifyRange = f.get<fdb::future_var::Bool>();
ctx->done();
});
},
[=]() {
if (!*retBlobbifyRange) {
schedule([=]() { blobbifyTenant(tenantId, blobbifiedCount, cont); });
} else {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
void ApiWorkload::verifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retVerifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: verifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, /*latest_version*/ -2).eraseType();
ctx->continueAfter(f, [ctx, retVerifyVersion, f]() {
*retVerifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
});
},
[=]() {
if (*retVerifyVersion == -1) {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
} else {
if (blobbifiedCount->fetch_sub(1) == 1) {
schedule(cont);
}
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
} // namespace FdbApiTester

View File

@ -41,6 +41,9 @@ public:
virtual void checkProgress() override;
// Workload specific setup phase.
virtual void setup(TTaskFct cont);
// Running specific tests
// The default implementation generates a workload consisting of
// random operations generated by randomOperation
@ -126,6 +129,12 @@ protected:
void randomClearRangeOp(TTaskFct cont, std::optional<int> tenantId);
std::optional<fdb::BytesRef> getTenant(std::optional<int> tenantId);
std::string debugTenantStr(std::optional<int> tenantId);
// Generic BlobGranules setup.
void setupBlobGranules(TTaskFct cont);
void blobbifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
void verifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
private:
void populateDataTx(TTaskFct cont, std::optional<int> tenantId);

View File

@ -52,26 +52,23 @@ private:
};
std::vector<OpType> excludedOpTypes;
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
// FIXME: this needs to be per tenant if tenant ids are set
std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
std::string tenantDebugString(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format(" (tenant {0})", tenantId.value()) : "";
}
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}){3}: {4}",
info(fmt::format("{0}: [{1} - {2}) {3}: {4}",
opName,
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
tenantDebugString(tenantId),
debugTenantStr(tenantId),
message));
}
}
@ -117,7 +114,7 @@ private:
results.get()->assign(resVector.begin(), resVector.end());
bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) {
info(fmt::format("Read{0}: first success\n", tenantDebugString(tenantId)));
info(fmt::format("Read {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
} else {
debugOp("Read", begin, end, tenantId, "complete");
@ -289,20 +286,19 @@ private:
}
// TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont) {
void randomGetBlobRangesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
if (begin > end) {
std::swap(begin, end);
}
std::optional<int> tenantId = {};
debugOp("GetBlobRanges", begin, end, tenantId, "starting");
execOperation(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType();
fdb::Future f = ctx->dbOps()->listBlobbifiedRanges(begin, end, 1000).eraseType();
ctx->continueAfter(f, [ctx, f, results]() {
*results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>());
ctx->done();
@ -314,25 +310,24 @@ private:
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
// TODO: tenant support
void randomVerifyOp(TTaskFct cont) {
void randomVerifyOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
std::optional<int> tenantId;
if (begin > end) {
std::swap(begin, end);
}
auto verifyVersion = std::make_shared<int64_t>(false);
debugOp("Verify", begin, end, tenantId, "starting");
auto verifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[begin, end, verifyVersion](auto ctx) {
fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
ctx->continueAfter(f, [ctx, verifyVersion, f]() {
*verifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
@ -344,15 +339,16 @@ private:
if (*verifyVersion == -1) {
ASSERT(!previousSuccess);
} else if (!previousSuccess) {
info(fmt::format("Verify{0}: first success\n", tenantDebugString(tenantId)));
info(fmt::format("Verify {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
}
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
void randomOperation(TTaskFct cont) {
void randomOperation(TTaskFct cont) override {
std::optional<int> tenantId = randomTenant();
OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
@ -380,10 +376,10 @@ private:
randomSummarizeOp(cont, tenantId);
break;
case OP_GET_BLOB_RANGES:
randomGetBlobRangesOp(cont);
randomGetBlobRangesOp(cont, tenantId);
break;
case OP_VERIFY:
randomVerifyOp(cont);
randomVerifyOp(cont, tenantId);
break;
}
}

View File

@ -47,6 +47,8 @@ private:
OP_LAST = OP_CANCEL_PURGE
};
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// Could add "summarize too old" and "verify too old" as ops if desired, but those are lower value
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet

View File

@ -91,13 +91,15 @@ public:
fdbDb = executor->selectDatabase();
}
if (tenantName) {
fdbTenant = fdbDb.openTenant(*tenantName);
fdbDbOps = std::make_shared<fdb::Tenant>(fdbTenant);
} else {
fdbDbOps = std::make_shared<fdb::Database>(fdbDb);
}
if (transactional) {
if (tenantName) {
fdb::Tenant tenant = fdbDb.openTenant(*tenantName);
fdbTx = tenant.createTransaction();
} else {
fdbTx = fdbDb.createTransaction();
}
fdbTx = fdbDbOps->createTransaction();
}
}
@ -109,6 +111,10 @@ public:
fdb::Database db() override { return fdbDb.atomic_load(); }
fdb::Tenant tenant() override { return fdbTenant.atomic_load(); }
std::shared_ptr<fdb::IDatabaseOps> dbOps() override { return std::atomic_load(&fdbDbOps); }
fdb::Transaction tx() override { return fdbTx.atomic_load(); }
// Set a continuation to be executed when a future gets ready
@ -272,13 +278,17 @@ protected:
scheduler->schedule([thisRef]() {
fdb::Database db = thisRef->executor->selectDatabase();
thisRef->fdbDb.atomic_store(db);
if (thisRef->tenantName) {
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTenant.atomic_store(tenant);
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Tenant>(tenant)));
} else {
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Database>(db)));
}
if (thisRef->transactional) {
if (thisRef->tenantName) {
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTx.atomic_store(tenant.createTransaction());
} else {
thisRef->fdbTx.atomic_store(db.createTransaction());
}
thisRef->fdbTx.atomic_store(thisRef->fdbDbOps->createTransaction());
}
thisRef->restartTransaction();
});
@ -317,6 +327,14 @@ protected:
// Provides a thread safe interface by itself (no need for mutex)
fdb::Database fdbDb;
// FDB tenant
// Provides a thread safe interface by itself (no need for mutex)
fdb::Tenant fdbTenant;
// FDB IDatabaseOps to hide database/tenant accordingly.
// Provides a shared pointer to database functions based on if db or tenant.
std::shared_ptr<fdb::IDatabaseOps> fdbDbOps;
// FDB transaction
// Provides a thread safe interface by itself (no need for mutex)
fdb::Transaction fdbTx;
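The executor republishes fdbDbOps from the scheduler thread while workload threads read it through dbOps(), so the shared_ptr itself is swapped via std::atomic_store and read via std::atomic_load instead of plain assignment. A minimal sketch of that one-writer/many-readers publication pattern (types here are illustrative):

#include <atomic>
#include <memory>

struct Ops {
    virtual ~Ops() = default;
};

std::shared_ptr<Ops> g_ops; // shared between one writer and many reader threads

// Writer: construct the replacement fully, then publish it atomically.
void publish(std::shared_ptr<Ops> fresh) {
    std::atomic_store(&g_ops, std::move(fresh));
}

// Readers: take a stable snapshot; the object stays alive while the snapshot is held.
std::shared_ptr<Ops> snapshot() {
    return std::atomic_load(&g_ops);
}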

View File

@ -41,6 +41,12 @@ public:
// Current FDB database
virtual fdb::Database db() = 0;
// Current FDB tenant
virtual fdb::Tenant tenant() = 0;
// Current FDB IDatabaseOps
virtual std::shared_ptr<fdb::IDatabaseOps> dbOps() = 0;
// Current FDB transaction
virtual fdb::Transaction tx() = 0;

View File

@ -117,8 +117,11 @@ void WorkloadBase::execTransaction(TOpStartFct startFct,
}
// Execute a non-transactional database operation within the workload
void WorkloadBase::execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError) {
doExecute(startFct, cont, {}, failOnError, false);
void WorkloadBase::execOperation(TOpStartFct startFct,
TTaskFct cont,
std::optional<fdb::BytesRef> tenant,
bool failOnError) {
doExecute(startFct, cont, tenant, failOnError, false);
}
void WorkloadBase::doExecute(TOpStartFct startFct,

View File

@ -125,7 +125,10 @@ protected:
bool failOnError = true);
// Execute a non-transactional database operation within the workload
void execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError = true);
void execOperation(TOpStartFct startFct,
TTaskFct cont,
std::optional<fdb::BytesRef> tenant = std::optional<fdb::BytesRef>(),
bool failOnError = true);
// Log an error message, increase error counter
void error(const std::string& msg);

View File

@ -677,7 +677,28 @@ public:
}
};
class Tenant final {
// Treated as an abstract class rather than an interface to preserve the lifetime of the FDB objects owned by Tenant
// and Database.
class IDatabaseOps {
public:
virtual ~IDatabaseOps() = default;
virtual Transaction createTransaction() = 0;
virtual TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin,
KeyRef end,
int rangeLimit) = 0;
virtual TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) = 0;
virtual TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin,
KeyRef end,
int64_t version,
bool force) = 0;
virtual TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) = 0;
};
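With this interface a call site picks its scope once, whole database or single tenant, and then issues blob-granule operations and creates transactions uniformly; this is how the tester wires up its dbOps() accessor. A hedged sketch of the intended use, assuming the fdb.hpp types above and a tenantName optional from the surrounding context:

// Choose the scope once, then program against IDatabaseOps (sketch).
std::shared_ptr<fdb::IDatabaseOps> ops;
if (tenantName) {
    ops = std::make_shared<fdb::Tenant>(db.openTenant(*tenantName));
} else {
    ops = std::make_shared<fdb::Database>(db);
}
fdb::Future f = ops->blobbifyRange(begin, end).eraseType();
fdb::Transaction tx = ops->createTransaction();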
class Tenant final : public IDatabaseOps {
friend class Database;
std::shared_ptr<native::FDBTenant> tenant;
@ -694,6 +715,14 @@ public:
Tenant& operator=(const Tenant&) noexcept = default;
Tenant() noexcept : tenant(nullptr) {}
void atomic_store(Tenant other) { std::atomic_store(&tenant, other.tenant); }
Tenant atomic_load() {
Tenant retVal;
retVal.tenant = std::atomic_load(&tenant);
return retVal;
}
static void createTenant(Transaction tr, BytesRef name) {
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, BytesRef());
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE, BytesRef());
@ -715,7 +744,7 @@ public:
return tr.get(toBytesRef(fmt::format("{}{}", tenantManagementMapPrefix, toCharsRef(name))), false);
}
Transaction createTransaction() {
Transaction createTransaction() override {
auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
auto err = Error(native::fdb_tenant_create_transaction(tenant.get(), &tx_native));
if (err)
@ -723,14 +752,49 @@ public:
return Transaction(tx_native);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant)
throw std::runtime_error("blobbifyRange from null tenant");
throw std::runtime_error("blobbifyRange() from null tenant");
return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant)
throw std::runtime_error("unblobbifyRange() from null tenant");
return native::fdb_tenant_unblobbify_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!tenant)
throw std::runtime_error("listBlobbifiedRanges() from null tenant");
return native::fdb_tenant_list_blobbified_ranges(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
}
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!tenant)
throw std::runtime_error("verifyBlobRange() from null tenant");
return native::fdb_tenant_verify_blob_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
}
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!tenant)
throw std::runtime_error("purgeBlobGranules() from null tenant");
native::fdb_bool_t forceBool = force;
return native::fdb_tenant_purge_blob_granules(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
}
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!tenant)
throw std::runtime_error("waitPurgeGranulesComplete() from null tenant");
return native::fdb_tenant_wait_purge_granules_complete(tenant.get(), purgeKey.data(), intSize(purgeKey));
}
};
class Database {
class Database : public IDatabaseOps {
friend class Tenant;
std::shared_ptr<native::FDBDatabase> db;
@ -789,7 +853,7 @@ public:
return Tenant(tenant_native);
}
Transaction createTransaction() {
Transaction createTransaction() override {
if (!db)
throw std::runtime_error("create_transaction from null database");
auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
@ -799,33 +863,33 @@ public:
return Transaction(tx_native);
}
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) {
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!db)
throw std::runtime_error("listBlobbifiedRanges from null database");
return native::fdb_database_list_blobbified_ranges(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
}
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) {
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!db)
throw std::runtime_error("verifyBlobRange from null database");
return native::fdb_database_verify_blob_range(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!db)
throw std::runtime_error("blobbifyRange from null database");
return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!db)
throw std::runtime_error("unblobbifyRange from null database");
return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) {
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!db)
throw std::runtime_error("purgeBlobGranules from null database");
native::fdb_bool_t forceBool = force;
@ -833,7 +897,7 @@ public:
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
}
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) {
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!db)
throw std::runtime_error("purgeBlobGranules from null database");
return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey));

View File

@ -1,12 +1,12 @@
#!/usr/bin/env python3
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import argparse
from pathlib import Path
import platform
import shutil
import subprocess
import sys
import os
import glob
import unittest
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "tests", "TestRunner")]
@ -18,6 +18,9 @@ from local_cluster import LocalCluster, random_secret_string
PREV_RELEASE_VERSION = "7.1.5"
PREV_PREV_RELEASE_VERSION = "7.0.0"
args = None
downloader = None
def version_from_str(ver_str):
ver = [int(s) for s in ver_str.split(".")]
@ -30,11 +33,9 @@ def api_version_from_str(ver_str):
return ver_tuple[0] * 100 + ver_tuple[1] * 10
class TestEnv(LocalCluster):
class TestCluster(LocalCluster):
def __init__(
self,
args,
downloader: FdbBinaryDownloader,
version: str,
):
self.client_config_tester_bin = Path(args.client_config_tester_bin).resolve()
@ -44,35 +45,33 @@ class TestEnv(LocalCluster):
assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir)
self.tmp_dir = self.build_dir.joinpath("tmp", random_secret_string(16))
self.tmp_dir.mkdir(parents=True)
self.downloader = downloader
self.version = version
super().__init__(
self.tmp_dir,
self.downloader.binary_path(version, "fdbserver"),
self.downloader.binary_path(version, "fdbmonitor"),
self.downloader.binary_path(version, "fdbcli"),
downloader.binary_path(version, "fdbserver"),
downloader.binary_path(version, "fdbmonitor"),
downloader.binary_path(version, "fdbcli"),
1,
)
self.set_env_var("LD_LIBRARY_PATH", self.downloader.lib_dir(version))
self.failed_cnt = 0
self.set_env_var("LD_LIBRARY_PATH", downloader.lib_dir(version))
def __enter__(self):
super().__enter__()
super().create_database()
return self
def setup(self):
self.__enter__()
self.create_database()
def __exit__(self, xc_type, exc_value, traceback):
super().__exit__(xc_type, exc_value, traceback)
def tearDown(self):
self.__exit__(None, None, None)
shutil.rmtree(self.tmp_dir)
# Client configuration tests using a cluster of the current version
class ClientConfigTest:
def __init__(self, test_env: TestEnv, title: str):
self.test_env = test_env
self.title = title
def __init__(self, tc: unittest.TestCase):
self.tc = tc
self.cluster = tc.cluster
self.external_lib_dir = None
self.external_lib_path = None
self.test_dir = self.test_env.tmp_dir.joinpath(random_secret_string(16))
self.test_dir = self.cluster.tmp_dir.joinpath(random_secret_string(16))
self.test_dir.mkdir(parents=True)
self.log_dir = self.test_dir.joinpath("log")
self.log_dir.mkdir(parents=True)
@ -88,31 +87,28 @@ class ClientConfigTest:
self.external_lib_dir = self.test_dir.joinpath("extclients")
self.external_lib_dir.mkdir(parents=True)
for version in versions:
src_file_path = self.test_env.downloader.lib_path(version)
assert src_file_path.exists(), "{} does not exist".format(src_file_path)
src_file_path = downloader.lib_path(version)
self.tc.assertTrue(src_file_path.exists(), "{} does not exist".format(src_file_path))
target_file_path = self.external_lib_dir.joinpath("libfdb_c.{}.so".format(version))
shutil.copyfile(src_file_path, target_file_path)
assert target_file_path.exists(), "{} does not exist".format(target_file_path)
self.tc.assertTrue(target_file_path.exists(), "{} does not exist".format(target_file_path))
def create_external_lib_path(self, version):
src_file_path = self.test_env.downloader.lib_path(version)
assert src_file_path.exists(), "{} does not exist".format(src_file_path)
src_file_path = downloader.lib_path(version)
self.tc.assertTrue(src_file_path.exists(), "{} does not exist".format(src_file_path))
self.external_lib_path = self.test_dir.joinpath("libfdb_c.{}.so".format(version))
shutil.copyfile(src_file_path, self.external_lib_path)
assert self.external_lib_path.exists(), "{} does not exist".format(self.external_lib_path)
self.tc.assertTrue(self.external_lib_path.exists(), "{} does not exist".format(self.external_lib_path))
def dump_client_logs(self):
for log_file in glob.glob(os.path.join(self.log_dir, "*")):
print(">>>>>>>>>>>>>>>>>>>> Contents of {}:".format(log_file))
print(">>>>>>>>>>>>>>>>>>>> Contents of {}:".format(log_file), file=sys.stderr)
with open(log_file, "r") as f:
print(f.read())
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file))
print(f.read(), file=sys.stderr)
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file), file=sys.stderr)
def exec(self):
print("-" * 80)
print(self.title)
print("-" * 80)
cmd_args = [self.test_env.client_config_tester_bin, "--cluster-file", self.test_env.cluster_file]
cmd_args = [self.cluster.client_config_tester_bin, "--cluster-file", self.cluster.cluster_file]
if self.tmp_dir is not None:
cmd_args += ["--tmp-dir", self.tmp_dir]
@ -141,61 +137,66 @@ class ClientConfigTest:
if self.transaction_timeout is not None:
cmd_args += ["--transaction-timeout", str(self.transaction_timeout)]
print("Executing test command: {}".format(" ".join([str(c) for c in cmd_args])))
tester_proc = subprocess.Popen(cmd_args, stdout=sys.stdout, stderr=sys.stderr)
tester_retcode = tester_proc.wait()
if tester_retcode != 0:
print("Test '{}' failed".format(self.title))
self.test_env.failed_cnt += 1
self.cleanup()
print("\nExecuting test command: {}".format(" ".join([str(c) for c in cmd_args])), file=sys.stderr)
try:
tester_proc = subprocess.Popen(cmd_args, stdout=sys.stdout, stderr=sys.stderr)
tester_retcode = tester_proc.wait()
self.tc.assertEqual(0, tester_retcode)
finally:
self.cleanup()
def cleanup(self):
shutil.rmtree(self.test_dir)
class ClientConfigTests:
def __init__(self, args):
self.args = args
self.downloader = FdbBinaryDownloader(args.build_dir)
# binary downloads are currently available only for x86_64
self.platform = platform.machine()
if self.platform == "x86_64":
self.downloader.download_old_binaries(PREV_RELEASE_VERSION)
self.downloader.download_old_binaries(PREV_PREV_RELEASE_VERSION)
class ClientConfigTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.cluster = TestCluster(CURRENT_VERSION)
cls.cluster.setup()
def test_local_client_only(self, test_env):
test = ClientConfigTest(test_env, "Local client only")
@classmethod
def tearDownClass(cls):
cls.cluster.tearDown()
def test_local_client_only(self):
# Local client only
test = ClientConfigTest(self)
test.exec()
def test_single_external_client_only(self, test_env):
test = ClientConfigTest(test_env, "Single external client")
def test_single_external_client_only(self):
# Single external client only
test = ClientConfigTest(self)
test.create_external_lib_path(CURRENT_VERSION)
test.disable_local_client = True
test.exec()
def test_same_local_and_external_client(self, test_env):
test = ClientConfigTest(test_env, "Same Local & External Client")
def test_same_local_and_external_client(self):
# Same version local & external client
test = ClientConfigTest(self)
test.create_external_lib_path(CURRENT_VERSION)
test.exec()
def test_multiple_external_clients(self, test_env):
test = ClientConfigTest(test_env, "Multiple external clients")
def test_multiple_external_clients(self):
# Multiple external clients, normal case
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(PREV_PREV_RELEASE_VERSION)
test.exec()
def test_no_external_client_support_api_version(self, test_env):
test = ClientConfigTest(test_env, "Multiple external clients; API version supported by none")
def test_no_external_client_support_api_version(self):
# Multiple external clients, API version supported by none of them
test = ClientConfigTest(self)
test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
test.expected_error = 2204 # API function missing
test.exec()
def test_no_external_client_support_api_version_ignore(self, test_env):
test = ClientConfigTest(test_env, "Multiple external clients; API version supported by none; Ignore failures")
def test_no_external_client_support_api_version_ignore(self):
# Multiple external clients; API version supported by none of them; Ignore failures
test = ClientConfigTest(self)
test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
@ -203,79 +204,66 @@ class ClientConfigTests:
test.expected_error = 2124 # All external clients failed
test.exec()
def test_one_external_client_wrong_api_version(self, test_env):
test = ClientConfigTest(test_env, "Multiple external clients: API version unsupported by one")
def test_one_external_client_wrong_api_version(self):
# Multiple external clients, API version unsupported by one of them
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
test.expected_error = 2204 # API function missing
test.exec()
def test_one_external_client_wrong_api_version_ignore(self, test_env):
test = ClientConfigTest(test_env, "Multiple external clients; API version unsupported by one; Ignore failures")
def test_one_external_client_wrong_api_version_ignore(self):
# Multiple external clients; API version unsupported by one of them; Ignore failures
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION)
test.ignore_external_client_failures = True
test.exec()
def test_prev_release_with_ext_client(self, test_env):
test = ClientConfigTest(test_env, "Cluster with previous release version")
# Client configuration tests using a cluster of the previous release version
class ClientConfigPrevVersionTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.cluster = TestCluster(PREV_RELEASE_VERSION)
cls.cluster.setup()
@classmethod
def tearDownClass(cls):
cls.cluster.tearDown()
def test_external_client(self):
# Using an external client to connect
test = ClientConfigTest(self)
test.create_external_lib_path(PREV_RELEASE_VERSION)
test.api_version = api_version_from_str(PREV_RELEASE_VERSION)
test.exec()
def test_prev_release_with_ext_client_unsupported_api(self, test_env):
test = ClientConfigTest(test_env, "Cluster with previous release version; Unsupported API version")
def test_prev_release_with_ext_client_unsupported_api(self):
# Leaves the API version at a default unsupported by the external client
test = ClientConfigTest(self)
test.create_external_lib_path(PREV_RELEASE_VERSION)
test.expected_error = 2204 # API function missing
test.exec()
def test_prev_release_with_ext_client_unsupported_api_ignore(self, test_env):
test = ClientConfigTest(
test_env, "Cluster with previous release version; Unsupported API version; Ignore failures"
)
def test_prev_release_with_ext_client_unsupported_api_ignore(self):
# Leaves the API version at a default unsupported by the external client; failures ignored
test = ClientConfigTest(self)
test.create_external_lib_path(PREV_RELEASE_VERSION)
test.transaction_timeout = 100
test.expected_error = 1031 # Timeout
test.ignore_external_client_failures = True
test.exec()
def run_tests(self):
failed_cnt = 0
with TestEnv(self.args, self.downloader, CURRENT_VERSION) as test_env:
self.test_local_client_only(test_env)
self.test_single_external_client_only(test_env)
self.test_same_local_and_external_client(test_env)
self.test_multiple_external_clients(test_env)
self.test_no_external_client_support_api_version(test_env)
self.test_no_external_client_support_api_version_ignore(test_env)
self.test_one_external_client_wrong_api_version(test_env)
self.test_one_external_client_wrong_api_version_ignore(test_env)
failed_cnt += test_env.failed_cnt
if self.platform == "x86_64":
with TestEnv(self.args, self.downloader, PREV_RELEASE_VERSION) as test_env:
self.test_prev_release_with_ext_client(test_env)
self.test_prev_release_with_ext_client_unsupported_api(test_env)
self.test_prev_release_with_ext_client_unsupported_api_ignore(test_env)
failed_cnt += test_env.failed_cnt
if failed_cnt > 0:
print("{} tests failed".format(failed_cnt))
else:
print("All tests successful")
return failed_cnt
if __name__ == "__main__":
parser = ArgumentParser(
formatter_class=RawDescriptionHelpFormatter,
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""
A script for testing FDB multi-version client in upgrade scenarios. Creates a local cluster,
generates a workload using fdb_c_api_tester with a specified test file, and performs
cluster upgrade according to the specified upgrade path. Checks if the workload successfully
progresses after each upgrade step.
Unit tests for running the FDB client with different configurations.
Also accepts Python unittest command-line arguments.
""",
)
parser.add_argument(
@ -291,7 +279,13 @@ if __name__ == "__main__":
help="Path to the fdb_c_client_config_tester executable.",
required=True,
)
parser.add_argument("unittest_args", nargs=argparse.REMAINDER)
args = parser.parse_args()
test = ClientConfigTests(args)
failed_cnt = test.run_tests()
sys.exit(failed_cnt)
sys.argv[1:] = args.unittest_args
downloader = FdbBinaryDownloader(args.build_dir)
downloader.download_old_binaries(PREV_RELEASE_VERSION)
downloader.download_old_binaries(PREV_PREV_RELEASE_VERSION)
unittest.main(verbosity=2)
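Because the script splices unittest_args back into sys.argv before calling unittest.main, anything after the script's own flags is interpreted by the standard unittest runner; for example, a single suite or test such as ClientConfigTests.test_local_client_only can be selected in the usual unittest way.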

View File

@ -0,0 +1,47 @@
/*
* BlobRestoreCommand.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/FDBOptions.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace fdb_cli {
ACTOR Future<bool> blobRestoreCommandActor(Database localDb, std::vector<StringRef> tokens) {
if (tokens.size() != 1 && tokens.size() != 2) {
printUsage(tokens[0]);
return false;
}
state bool success = false;
wait(store(success, localDb->blobRestore(normalKeys)));
if (success) {
fmt::print("Started blob restore for the full cluster. Please use 'status' command to check progress.\n");
} else {
fmt::print("Fail to start a new blob restore while there is a pending one.\n");
}
return success;
}
CommandFactory blobRestoreFactory("blobrestore", CommandHelp("blobrestore", "", ""));
} // namespace fdb_cli

View File

@ -1416,6 +1416,13 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
continue;
}
if (tokencmp(tokens[0], "blobrestore")) {
bool _result = wait(makeInterruptable(blobRestoreCommandActor(localDb, tokens)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "unlock")) {
if ((tokens.size() != 2) || (tokens[1].size() != 32) ||
!std::all_of(tokens[1].begin(), tokens[1].end(), &isxdigit)) {

View File

@ -213,6 +213,9 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
ACTOR Future<bool> blobKeyCommandActor(Database localDb,
Optional<TenantMapEntry> tenantEntry,
std::vector<StringRef> tokens);
// blobrestore command
ACTOR Future<bool> blobRestoreCommandActor(Database localDb, std::vector<StringRef> tokens);
// maintenance command
ACTOR Future<bool> setHealthyZone(Reference<IDatabase> db, StringRef zoneId, double seconds, bool printWarning = false);
ACTOR Future<bool> clearHealthyZone(Reference<IDatabase> db,

View File

@ -45,7 +45,12 @@ def run_fdbcli_command(*args):
string: Console output from fdbcli
"""
commands = command_template + ["{}".format(' '.join(args))]
return subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env).stdout.decode('utf-8').strip()
try:
# if the fdbcli command is stuck for more than 20 seconds, the database is definitely unavailable
process = subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env, timeout=20)
return process.stdout.decode('utf-8').strip()
except subprocess.TimeoutExpired:
raise Exception('The fdbcli command is stuck, database is unavailable')
def run_fdbcli_command_and_get_error(*args):
@ -1079,16 +1084,19 @@ if __name__ == '__main__':
lockAndUnlock()
maintenance()
profile()
suspend()
# TODO: re-enable once it's stable
# suspend()
transaction()
throttle()
# this is replaced by the "quota" command
#throttle()
triggerddteaminfolog()
tenants()
versionepoch()
integer_options()
tls_address_suffix()
knobmanagement()
quota()
# TODO: fix the issue when running through the external client
#quota()
else:
assert args.process_number > 1, "Process number should be positive"
coordinators()

View File

@ -971,6 +971,11 @@ void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion,
// clearVersion as previous guy)
}
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange) {
SortedDeltasT deltasByKey;
sortDeltasByKey(deltasByVersion, fileRange, deltasByKey);
}
// FIXME: Could maybe reduce duplicated code between this and chunkedSnapshot for chunking
Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef,
const Standalone<GranuleDeltas>& deltas,

View File

@ -5924,7 +5924,6 @@ public:
printf("Restoring backup to version: %lld\n", (long long)targetVersion);
}
state int retryCount = 0;
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
loop {
try {
@ -5948,17 +5947,9 @@ public:
wait(tr->commit());
break;
} catch (Error& e) {
if (e.code() == error_code_transaction_too_old) {
retryCount++;
}
if (e.code() == error_code_restore_duplicate_tag) {
throw;
}
if (g_network->isSimulated() && retryCount > 50) {
CODE_PROBE(true, "submitRestore simulation speedup");
// try to make the read window back to normal size (5 * version_per_sec)
g_simulator->speedUpSimulation = true;
}
wait(tr->onError(e));
}
}

View File

@ -2559,15 +2559,21 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
}
}
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) {
void setStorageQuota(Transaction& tr, StringRef tenantGroupName, int64_t quota) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantName);
auto key = storageQuotaKey(tenantGroupName);
tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned()));
}
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
void clearStorageQuota(Transaction& tr, StringRef tenantGroupName) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantGroupName);
tr.clear(key);
}
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantGroupName) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantGroupName)));
if (!v.present()) {
return Optional<int64_t>();
}

View File

@ -4524,9 +4524,11 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
output.readToBegin = readToBegin;
output.readThroughEnd = readThroughEnd;
if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) {
if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows) &&
(!std::is_same<GetKeyValuesFamilyRequest, GetMappedKeyValuesRequest>::value)) {
// Copy instead of resizing because the TSS may be using output's arena for comparison. This only
// happens in simulation, so it's fine.
// Disabled for prefetch, because boundary entries serve as continuations.
RangeResultFamily copy;
int newSize =
deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size());
@ -10915,6 +10917,37 @@ Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
}
ACTOR Future<bool> blobRestoreActor(Reference<DatabaseContext> cx, KeyRange range) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr->get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (status.progress < 100) {
return false; // stop if there is an in-progress restore.
}
}
Standalone<BlobRestoreStatus> status;
status.progress = 0;
Value newValue = blobRestoreCommandValueFor(status);
tr->set(key, newValue);
wait(tr->commit());
return true;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
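blobRestoreActor follows the canonical FDB retry idiom: perform the reads and writes, attempt the commit, and on error defer to onError(), which backs off and resets the transaction for retryable errors and rethrows fatal ones. A stripped-down skeleton of the idiom (a sketch with illustrative names, not code from this change):

ACTOR Future<Void> setKeyWithRetry(Database db, Key key, Value value) {
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
    loop {
        try {
            tr->set(key, value);
            wait(tr->commit());
            return Void();
        } catch (Error& e) {
            wait(tr->onError(e)); // retries retryable errors; rethrows fatal ones
        }
    }
}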
Future<bool> DatabaseContext::blobRestore(KeyRange range) {
return blobRestoreActor(Reference<DatabaseContext>::addRef(this), range);
}
int64_t getMaxKeySize(KeyRef const& key) {
return getMaxWriteKeySize(key, true);
}

View File

@ -296,7 +296,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000;
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 );
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false ); if(isSimulated) DD_TENANT_AWARENESS_ENABLED = deterministicRandom()->coinflip();
init( DD_TENANT_AWARENESS_ENABLED, false );
init( STORAGE_QUOTA_ENABLED, false ); if(isSimulated) STORAGE_QUOTA_ENABLED = deterministicRandom()->coinflip();
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
@ -387,7 +388,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_BACKGROUND_PARALLELISM, 4 );
init( ROCKSDB_READ_PARALLELISM, 4 );
// If true, do not process and store RocksDB logs
init( ROCKSDB_MUTE_LOGS, false );
init( ROCKSDB_MUTE_LOGS, true );
// Use a smaller memtable in simulation to avoid OOMs.
int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024;
init( ROCKSDB_MEMTABLE_BYTES, memtableBytes );
@ -809,18 +810,24 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip();
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );
init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100);
init( STRICTLY_ENFORCE_BYTE_LIMIT, false); if( randomize && BUGGIFY ) STRICTLY_ENFORCE_BYTE_LIMIT = deterministicRandom()->coinflip();
init( FRACTION_INDEX_BYTELIMIT_PREFETCH, 0.2); if( randomize && BUGGIFY ) FRACTION_INDEX_BYTELIMIT_PREFETCH = 0.01 + deterministicRandom()->random01();
init( MAX_PARALLEL_QUICK_GET_VALUE, 10 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100);
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
// Read priority definitions in the form of a list of their relative concurrency share weights
init( STORAGESERVER_READ_PRIORITIES, "120,10,20,40,60" );
// The total concurrency which will be shared by active priorities according to their relative weights
init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" );
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" );
// The priority number which each ReadType maps to in enumeration order
// This exists for flexibility but assigning each ReadType to its own unique priority number makes the most sense
// The enumeration is currently: eager, fetch, low, normal, high
init( STORAGESERVER_READTYPE_PRIORITY_MAP, "0,1,2,3,4" );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -944,7 +951,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" );
init( REDWOOD_IO_PRIORITIES, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement

View File

@ -1660,11 +1660,41 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) {
return interf;
}
const KeyRangeRef blobRestoreCommandKeys("\xff\x02/blobRestoreCommand/"_sr, "\xff\x02/blobRestoreCommand0"_sr);
const Value blobRestoreCommandKeyFor(const KeyRangeRef range) {
BinaryWriter wr(AssumeVersion(ProtocolVersion::withBlobGranule()));
wr.serializeBytes(blobRestoreCommandKeys.begin);
wr << range;
return wr.toValue();
}
const KeyRange decodeBlobRestoreCommandKeyFor(const KeyRef key) {
KeyRange range;
BinaryReader reader(key.removePrefix(blobRestoreCommandKeys.begin),
AssumeVersion(ProtocolVersion::withBlobGranule()));
reader >> range;
return range;
}
const Value blobRestoreCommandValueFor(BlobRestoreStatus status) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << status;
return wr.toValue();
}
Standalone<BlobRestoreStatus> decodeBlobRestoreStatus(ValueRef const& value) {
Standalone<BlobRestoreStatus> status;
BinaryReader reader(value, IncludeVersion());
reader >> status;
return status;
}
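The command key is the fixed prefix followed by the serialized range, and the value wraps a BlobRestoreStatus, so the encode/decode pairs above should round-trip. A small sanity sketch under those assumptions:

KeyRange range = KeyRangeRef("a"_sr, "b"_sr);
Key key = blobRestoreCommandKeyFor(range);
ASSERT(key.startsWith(blobRestoreCommandKeys.begin));
ASSERT(decodeBlobRestoreCommandKeyFor(key) == range);

Standalone<BlobRestoreStatus> status;
status.progress = 42;
ASSERT(decodeBlobRestoreStatus(blobRestoreCommandValueFor(status)).progress == 42);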
const KeyRangeRef storageQuotaKeys("\xff/storageQuota/"_sr, "\xff/storageQuota0"_sr);
const KeyRef storageQuotaPrefix = storageQuotaKeys.begin;
Key storageQuotaKey(StringRef tenantName) {
return tenantName.withPrefix(storageQuotaPrefix);
Key storageQuotaKey(StringRef tenantGroupName) {
return tenantGroupName.withPrefix(storageQuotaPrefix);
}
const KeyRangeRef idempotencyIdKeys("\xff\x02/idmp/"_sr, "\xff\x02/idmp0"_sr);

View File

@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbrpc/Msgpack.h"
#include "fdbclient/Tracing.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
@ -79,41 +80,6 @@ struct LogfileTracer : ITracer {
}
};
struct TraceRequest {
std::unique_ptr<uint8_t[]> buffer;
// Amount of data in buffer (bytes).
std::size_t data_size;
// Size of buffer (bytes).
std::size_t buffer_size;
void write_byte(uint8_t byte) { write_bytes(&byte, 1); }
void write_bytes(const uint8_t* buf, std::size_t n) {
resize(n);
std::copy(buf, buf + n, buffer.get() + data_size);
data_size += n;
}
void resize(std::size_t n) {
if (data_size + n <= buffer_size) {
return;
}
std::size_t size = buffer_size;
while (size < data_size + n) {
size *= 2;
}
TraceEvent(SevInfo, "TracingSpanResizedBuffer").detail("OldSize", buffer_size).detail("NewSize", size);
auto new_buffer = std::make_unique<uint8_t[]>(size);
std::copy(buffer.get(), buffer.get() + data_size, new_buffer.get());
buffer = std::move(new_buffer);
buffer_size = size;
}
void reset() { data_size = 0; }
};
// A server listening for UDP trace messages, run only in simulation.
ACTOR Future<Void> simulationStartServer() {
// We're going to force the address to be loopback regardless of FLOW_KNOBS->TRACING_UDP_LISTENER_ADDR
@ -167,146 +133,89 @@ ACTOR Future<Void> traceLog(int* pendingMessages, bool* sendError) {
struct UDPTracer : public ITracer {
// Serializes span fields as an array into the supplied TraceRequest
// buffer.
void serialize_span(const Span& span, TraceRequest& request) {
void serialize_span(const Span& span, MsgpackBuffer& buf) {
uint16_t size = 12;
request.write_byte(size | 0b10010000); // write as array
serialize_value(span.context.traceID.first(), request, 0xcf); // trace id
serialize_value(span.context.traceID.second(), request, 0xcf); // trace id
serialize_value(span.context.spanID, request, 0xcf); // spanid
buf.write_byte(size | 0b10010000); // write as array
serialize_value(span.context.traceID.first(), buf, 0xcf); // trace id
serialize_value(span.context.traceID.second(), buf, 0xcf); // trace id
serialize_value(span.context.spanID, buf, 0xcf); // spanid
// parent span id
serialize_value(span.parentContext.spanID, request, 0xcf); // spanId
serialize_value(span.parentContext.spanID, buf, 0xcf); // spanId
// Payload
serialize_string(span.location.name.toString(), request);
serialize_value(span.begin, request, 0xcb); // start time
serialize_value(span.end, request, 0xcb); // end
serialize_string(span.location.name.toString(), buf);
serialize_value(span.begin, buf, 0xcb); // start time
serialize_value(span.end, buf, 0xcb); // end
// Kind
serialize_value(span.kind, request, 0xcc);
serialize_value(span.kind, buf, 0xcc);
// Status
serialize_value(span.status, request, 0xcc);
serialize_value(span.status, buf, 0xcc);
// Links
serialize_vector(span.links, request);
serialize_vector(span.links, buf);
// Events
serialize_vector(span.events, request);
serialize_vector(span.events, buf);
// Attributes
serialize_map(span.attributes, request);
serialize_map(span.attributes, buf);
}
private:
// Writes the given value in big-endian format to the request. Sets the
// first byte to msgpack_type.
template <typename T>
inline void serialize_value(const T& val, TraceRequest& request, uint8_t msgpack_type) {
request.write_byte(msgpack_type);
const uint8_t* p = reinterpret_cast<const uint8_t*>(std::addressof(val));
for (size_t i = 0; i < sizeof(T); ++i) {
request.write_byte(p[sizeof(T) - i - 1]);
}
}
// Writes the given string to the request as a sequence of bytes. Inserts a
// format byte at the beginning of the string according to its length,
// as specified by the msgpack specification.
inline void serialize_string(const uint8_t* c, int length, TraceRequest& request) {
if (length <= 31) {
// A size 0 string is ok. We still need to write a byte
// identifying the item as a string, but can set the size to 0.
request.write_byte(static_cast<uint8_t>(length) | 0b10100000);
} else if (length <= 255) {
request.write_byte(0xd9);
request.write_byte(static_cast<uint8_t>(length));
} else if (length <= 65535) {
request.write_byte(0xda);
request.write_byte(reinterpret_cast<const uint8_t*>(&length)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&length)[0]);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeString")
.detail("Failed to MessagePack encode very large string", length);
ASSERT_WE_THINK(false);
}
request.write_bytes(c, length);
}
inline void serialize_string(const std::string& str, TraceRequest& request) {
serialize_string(reinterpret_cast<const uint8_t*>(str.data()), str.size(), request);
}
// Writes the given vector of linked SpanContext's to the request. If the vector is
// empty, the request is not modified.
inline void serialize_vector(const SmallVectorRef<SpanContext>& vec, TraceRequest& request) {
inline void serialize_vector(const SmallVectorRef<SpanContext>& vec, MsgpackBuffer& buf) {
int size = vec.size();
if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10010000);
buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) {
request.write_byte(0xdc);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
buf.write_byte(0xdc);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
for (const auto& link : vec) {
serialize_value(link.traceID.first(), request, 0xcf); // trace id
serialize_value(link.traceID.second(), request, 0xcf); // trace id
serialize_value(link.spanID, request, 0xcf); // spanid
serialize_value(link.traceID.first(), buf, 0xcf); // trace id
serialize_value(link.traceID.second(), buf, 0xcf); // trace id
serialize_value(link.spanID, buf, 0xcf); // spanid
}
}
// Writes the given vector of linked SpanContext's to the request. If the vector is
// Writes the given vector of linked SpanEventRef's to the request. If the vector is
// empty, the request is not modified.
inline void serialize_vector(const SmallVectorRef<SpanEventRef>& vec, TraceRequest& request) {
inline void serialize_vector(const SmallVectorRef<SpanEventRef>& vec, MsgpackBuffer& buf) {
int size = vec.size();
if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10010000);
buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) {
request.write_byte(0xdc);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
buf.write_byte(0xdc);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
for (const auto& event : vec) {
serialize_string(event.name.toString(), request); // event name
serialize_value(event.time, request, 0xcb); // event time
serialize_vector(event.attributes, request);
serialize_string(event.name.toString(), buf); // event name
serialize_value(event.time, buf, 0xcb); // event time
serialize_vector(event.attributes, buf);
}
}
inline void serialize_vector(const SmallVectorRef<KeyValueRef>& vals, TraceRequest& request) {
inline void serialize_vector(const SmallVectorRef<KeyValueRef>& vals, MsgpackBuffer& buf) {
int size = vals.size();
if (size <= 15) {
// N.B. We're actually writing this out as a fixmap here in messagepack format!
// fixmap 1000xxxx 0x80 - 0x8f
request.write_byte(static_cast<uint8_t>(size) | 0b10000000);
buf.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
for (const auto& kv : vals) {
serialize_string(kv.key.toString(), request);
serialize_string(kv.value.toString(), request);
}
}
template <class Map>
inline void serialize_map(const Map& map, TraceRequest& request) {
int size = map.size();
if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeMap").detail("Failed to MessagePack encode large map", size);
ASSERT_WE_THINK(false);
}
for (const auto& [key, value] : map) {
serialize_string(key.begin(), key.size(), request);
serialize_string(value.begin(), value.size(), request);
serialize_string(kv.key.toString(), buf);
serialize_string(kv.value.toString(), buf);
}
}
};
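The serializers above hand-roll MessagePack: a fixarray header is 0b1001xxxx with the element count in the low nibble, a fixmap header is 0b1000xxxx, a fixstr header is 0b101xxxxx, and 0xcf introduces a big-endian uint64. A tiny self-contained sketch of the same byte layout, independent of the tracer types (illustrative only):

#include <cstdint>
#include <string>
#include <vector>

// Append one uint64 as MessagePack: the 0xcf marker plus 8 big-endian bytes.
void packU64(std::vector<uint8_t>& out, uint64_t v) {
    out.push_back(0xcf);
    for (int i = 7; i >= 0; --i)
        out.push_back(static_cast<uint8_t>(v >> (8 * i)));
}

// Append a short string as a MessagePack fixstr (length must be <= 31).
void packFixStr(std::vector<uint8_t>& out, const std::string& s) {
    out.push_back(static_cast<uint8_t>(s.size()) | 0b10100000);
    out.insert(out.end(), s.begin(), s.end());
}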
@ -336,9 +245,9 @@ ACTOR Future<Void> fastTraceLogger(int* unreadyMessages, int* failedMessages, in
struct FastUDPTracer : public UDPTracer {
FastUDPTracer()
: unready_socket_messages_(0), failed_messages_(0), total_messages_(0), socket_fd_(-1), send_error_(false) {
request_ = TraceRequest{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
request_ = MsgpackBuffer{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
}
TracerType type() const override { return TracerType::NETWORK_LOSSY; }
@ -394,7 +303,7 @@ struct FastUDPTracer : public UDPTracer {
}
private:
TraceRequest request_;
MsgpackBuffer request_;
int unready_socket_messages_;
int failed_messages_;
@ -657,9 +566,9 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
IKnobCollection::getMutableGlobalKnobCollection().setKnob("tracing_span_attributes_enabled",
KnobValueRef::create(bool{ true }));
Span span1("encoded_span"_loc);
auto request = TraceRequest{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
auto request = MsgpackBuffer{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
auto tracer = FastUDPTracer();
tracer.serialize_span(span1, request);
auto data = request.buffer.get();

View File

@ -313,4 +313,15 @@ struct BlobManifest {
}
};
// Defines blob restore status
struct BlobRestoreStatus {
constexpr static FileIdentifier file_identifier = 378657;
int progress;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, progress);
}
};
#endif

View File

@ -56,4 +56,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);
#endif
// For benchmark testing only. It should never be called in prod.
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange);
#endif

View File

@ -403,6 +403,7 @@ public:
Future<Version> verifyBlobRange(const KeyRange& range,
Optional<Version> version,
Optional<TenantName> tenantName = {});
Future<bool> blobRestore(const KeyRange range);
// private:
explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,

View File

@ -163,9 +163,10 @@ bool schemaMatch(json_spirit::mValue const& schema,
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Set and get the storage quota per tenant
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
// Set/clear/get the storage quota for the given tenant group
void setStorageQuota(Transaction& tr, StringRef tenantGroupName, int64_t quota);
void clearStorageQuota(Transaction& tr, StringRef tenantGroupName);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantGroupName);
#include "flow/unactorcompiler.h"
#endif
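A minimal usage sketch for these quota helpers, assuming a connected Database `db` and the flow actor framework; the retry loop mirrors the transaction idiom used elsewhere in this diff, and any option handling (e.g. access to system keys) is assumed to happen inside setStorageQuota itself:

ACTOR Future<Void> setGroupQuotaExample(Database db, StringRef group, int64_t quota) {
    state Transaction tr(db);
    loop {
        try {
            setStorageQuota(tr, group, quota); // stages a write under \xff/storageQuota/<group>
            wait(tr.commit());
            return Void();
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}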

View File

@ -237,6 +237,8 @@ public:
int64_t
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
bool STORAGE_QUOTA_ENABLED; // Whether storage quota enforcement for tenant groups, and all the relevant storage
// usage / quota monitors, are enabled.
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed
// in the TenantCache
@ -761,14 +763,16 @@ public:
bool ENABLE_CLEAR_RANGE_EAGER_READS;
bool QUICK_GET_VALUE_FALLBACK;
bool QUICK_GET_KEY_VALUES_FALLBACK;
bool STRICTLY_ENFORCE_BYTE_LIMIT;
double FRACTION_INDEX_BYTELIMIT_PREFETCH;
int MAX_PARALLEL_QUICK_GET_VALUE;
int CHECKPOINT_TRANSFER_BLOCK_BYTES;
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READTYPE_PRIORITY_MAP;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -917,7 +921,7 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS;
std::string REDWOOD_IO_PRIORITIES;
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;

View File

@ -710,11 +710,18 @@ UID decodeBlobWorkerListKey(KeyRef const& key);
const Value blobWorkerListValue(BlobWorkerInterface const& interface);
BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value);
// Blob restore command
extern const KeyRangeRef blobRestoreCommandKeys;
const Value blobRestoreCommandKeyFor(const KeyRangeRef range);
const KeyRange decodeBlobRestoreCommandKeyFor(const KeyRef key);
const Value blobRestoreCommandValueFor(BlobRestoreStatus status);
Standalone<BlobRestoreStatus> decodeBlobRestoreStatus(ValueRef const& value);
// Storage quota per tenant
// "\xff/storageQuota/[[tenantName]]" := "[[quota]]"
// "\xff/storageQuota/[[tenantGroupName]]" := "[[quota]]"
extern const KeyRangeRef storageQuotaKeys;
extern const KeyRef storageQuotaPrefix;
Key storageQuotaKey(StringRef tenantName);
Key storageQuotaKey(StringRef tenantGroupName);
extern const KeyRangeRef idempotencyIdKeys;
extern const KeyRef idempotencyIdsExpiredVersion;
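To make the key layout above concrete, a small illustrative sketch; the group name is hypothetical, and the int64 encoding matches how the TenantCache decodes the quota value later in this diff (BinaryReader::fromStringRef<int64_t>(value, Unversioned())):

// "\xff/storageQuota/team-a" := 1 GiB
Key k = storageQuotaKey("team-a"_sr);
Value v = BinaryWriter::toValue(int64_t(1) << 30, Unversioned());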

View File

@ -0,0 +1,157 @@
/*
* Msgpack.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBRPC_MSGPACK_H
#define FDBRPC_MSGPACK_H
#include <limits>
#pragma once
#include <memory>
#include <algorithm>
#include "flow/Trace.h"
#include "flow/Error.h"
#include "flow/network.h"
struct MsgpackBuffer {
std::unique_ptr<uint8_t[]> buffer;
// Amount of data in buffer (bytes).
std::size_t data_size;
// Size of buffer (bytes).
std::size_t buffer_size;
void write_byte(uint8_t byte) { write_bytes(&byte, 1); }
// This assumes that pos <= data_size
void edit_byte(uint8_t byte, size_t pos) { buffer[pos] = byte; }
void write_bytes(const uint8_t* buf, std::size_t n) {
resize(n);
std::copy(buf, buf + n, buffer.get() + data_size);
data_size += n;
}
void resize(std::size_t n) {
if (data_size + n <= buffer_size) {
return;
}
std::size_t size = buffer_size;
while (size < data_size + n) {
size *= 2;
}
TraceEvent(SevInfo, "MsgpackResizedBuffer").detail("OldSize", buffer_size).detail("NewSize", size);
auto new_buffer = std::make_unique<uint8_t[]>(size);
std::copy(buffer.get(), buffer.get() + data_size, new_buffer.get());
buffer = std::move(new_buffer);
buffer_size = size;
}
void reset() { data_size = 0; }
};
inline void serialize_bool(bool val, MsgpackBuffer& buf) {
if (val) {
buf.write_byte(0xc3);
} else {
buf.write_byte(0xc2);
}
}
// Writes the given value in big-endian format to the buffer. Sets the
// first byte to msgpack_type.
template <typename T>
inline void serialize_value(const T& val, MsgpackBuffer& buf, uint8_t msgpack_type) {
buf.write_byte(msgpack_type);
const uint8_t* p = reinterpret_cast<const uint8_t*>(std::addressof(val));
for (size_t i = 0; i < sizeof(T); ++i) {
buf.write_byte(p[sizeof(T) - i - 1]);
}
}
// Writes the given string to the buffer as a sequence of bytes. Inserts a
// format byte at the beginning of the string according to its length,
// as specified by the msgpack specification.
inline void serialize_string(const uint8_t* c, int length, MsgpackBuffer& buf) {
if (length <= 31) {
// A size 0 string is ok. We still need to write a byte
// identifying the item as a string, but can set the size to 0.
buf.write_byte(static_cast<uint8_t>(length) | 0b10100000);
} else if (length <= 255) {
buf.write_byte(0xd9);
buf.write_byte(static_cast<uint8_t>(length));
} else if (length <= 65535) {
buf.write_byte(0xda);
buf.write_byte(reinterpret_cast<const uint8_t*>(&length)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&length)[0]);
} else {
TraceEvent(SevWarn, "MsgpackSerializeString").detail("Failed to MessagePack encode very large string", length);
ASSERT_WE_THINK(false);
}
buf.write_bytes(c, length);
}
inline void serialize_string(const std::string& str, MsgpackBuffer& buf) {
serialize_string(reinterpret_cast<const uint8_t*>(str.data()), str.size(), buf);
}
template <typename T, typename F>
inline void serialize_vector(const std::vector<T>& vec, MsgpackBuffer& buf, F f) {
size_t size = vec.size();
if (size <= 15) {
buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) {
buf.write_byte(0xdc);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else if (size <= std::numeric_limits<uint32_t>::max()) {
buf.write_byte(0xdd);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[3]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[2]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else {
TraceEvent(SevWarn, "MsgPackSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
// Use the provided serializer function to serialize the individual elements of the vector
for (const auto& val : vec) {
f(val, buf);
}
}
template <class Map>
inline void serialize_map(const Map& map, MsgpackBuffer& buf) {
int size = map.size();
if (size <= 15) {
buf.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "MsgPackSerializeMap").detail("Failed to MessagePack encode large map", size);
ASSERT_WE_THINK(false);
}
for (const auto& [key, value] : map) {
serialize_string(key.begin(), key.size(), buf);
serialize_string(value.begin(), value.size(), buf);
}
}
#endif
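A self-contained sketch of the header-byte selection the functions above implement (standard C++ only, so it can be compiled and checked against the MessagePack spec; it uses shifts instead of the reinterpret_cast byte access in the header, which is equivalent for these lengths):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Mirrors serialize_string(): pick fixstr / str8 / str16 based on length,
// then append the raw bytes. Length fields are big-endian per the spec.
static void encodeString(const std::string& s, std::vector<uint8_t>& out) {
    const size_t n = s.size();
    if (n <= 31) {
        out.push_back(static_cast<uint8_t>(n) | 0b10100000); // fixstr
    } else if (n <= 255) {
        out.push_back(0xd9); // str8
        out.push_back(static_cast<uint8_t>(n));
    } else if (n <= 65535) {
        out.push_back(0xda); // str16
        out.push_back(static_cast<uint8_t>(n >> 8));
        out.push_back(static_cast<uint8_t>(n & 0xff));
    }
    out.insert(out.end(), s.begin(), s.end());
}

int main() {
    std::vector<uint8_t> out;
    encodeString("abc", out);
    // Prints "a3 61 62 63": a fixstr header for length 3, then 'a' 'b' 'c'.
    std::printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
    return 0;
}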

View File

@ -20,6 +20,7 @@
#ifndef FDBRPC_TIMED_REQUEST_H
#define FDBRPC_TIMED_REQUEST_H
#include "flow/network.h"
#pragma once
#include <fdbrpc/fdbrpc.h>
@ -35,7 +36,7 @@ public:
TimedRequest() {
if (!FlowTransport::isClient()) {
_requestTime = timer();
_requestTime = g_network->timer();
} else {
_requestTime = 0.0;
}
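The one-line swap above is easy to miss; a plausible reading, consistent with flow conventions but stated here as an assumption rather than spelled out in this diff, is that timer() reads the wall clock while g_network->timer() follows the network's clock, which is virtualized under simulation:

// Semantics assumed from flow conventions, not from this diff:
double realNow = timer();           // wall-clock time, always advances in real time
double netNow = g_network->timer(); // network time; virtual when simulated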

View File

@ -388,6 +388,8 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
Promise<Void> iAmReplaced;
bool isFullRestoreMode = false;
BlobManagerData(UID id,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
Database db,
@ -3537,7 +3539,10 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
bmData->startRecruiting.trigger();
bmData->initBStore();
if (isFullRestoreMode()) {
bool isFullRestore = wait(isFullRestoreMode(bmData->db, normalKeys));
bmData->isFullRestoreMode = isFullRestore;
if (bmData->isFullRestoreMode) {
wait(loadManifest(bmData->db, bmData->bstore));
int64_t epoc = wait(lastBlobEpoc(bmData->db, bmData->bstore));
@ -5297,11 +5302,8 @@ ACTOR Future<Void> backupManifest(Reference<BlobManagerData> bmData) {
bmData->initBStore();
loop {
bool pendingSplit = wait(hasPendingSplit(bmData));
if (!pendingSplit) {
wait(dumpManifest(bmData->db, bmData->bstore, bmData->epoch, bmData->manifestDumperSeqNo));
bmData->manifestDumperSeqNo++;
}
wait(dumpManifest(bmData->db, bmData->bstore, bmData->epoch, bmData->manifestDumperSeqNo));
bmData->manifestDumperSeqNo++;
wait(delay(SERVER_KNOBS->BLOB_MANIFEST_BACKUP_INTERVAL));
}
}
@ -5370,7 +5372,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
if (SERVER_KNOBS->BG_ENABLE_MERGING) {
self->addActor.send(granuleMergeChecker(self));
}
if (SERVER_KNOBS->BLOB_MANIFEST_BACKUP && !isFullRestoreMode()) {
if (SERVER_KNOBS->BLOB_MANIFEST_BACKUP && !self->isFullRestoreMode) {
self->addActor.send(backupManifest(self));
}

View File

@ -60,7 +60,7 @@ struct BlobManifestFile {
int64_t seqNo{ 0 };
BlobManifestFile(const std::string& path) {
if (sscanf(path.c_str(), MANIFEST_FOLDER "/manifest.%" SCNd64 ".%" SCNd64, &epoch, &seqNo) == 2) {
if (sscanf(path.c_str(), MANIFEST_FOLDER "/" MANIFEST_FOLDER ".%" SCNd64 ".%" SCNd64, &epoch, &seqNo) == 2) {
fileName = path;
}
}
@ -76,7 +76,7 @@ struct BlobManifestFile {
BlobManifestFile file(path);
return file.epoch > 0 && file.seqNo > 0;
};
BackupContainerFileSystem::FilesAndSizesT filesAndSizes = wait(reader->listFiles(MANIFEST_FOLDER, filter));
BackupContainerFileSystem::FilesAndSizesT filesAndSizes = wait(reader->listFiles(MANIFEST_FOLDER "/", filter));
std::vector<BlobManifestFile> result;
for (auto& f : filesAndSizes) {
@ -107,6 +107,9 @@ public:
try {
state Standalone<BlobManifest> manifest;
Standalone<VectorRef<KeyValueRef>> rows = wait(getSystemKeys(self));
if (rows.size() == 0) {
return Void();
}
manifest.rows = rows;
Value data = encode(manifest);
wait(writeToFile(self, data));
@ -153,7 +156,8 @@ private:
state std::string fullPath;
std::tie(writer, fullPath) = self->blobConn_->createForWrite(MANIFEST_FOLDER);
state std::string fileName = format(MANIFEST_FOLDER "/manifest.%lld.%lld", self->epoch_, self->seqNo_);
state std::string fileName =
format(MANIFEST_FOLDER "/" MANIFEST_FOLDER ".%lld.%lld", self->epoch_, self->seqNo_);
state Reference<IBackupFile> file = wait(writer->writeFile(fileName));
wait(file->append(data.begin(), data.size()));
wait(file->finish());
@ -453,3 +457,26 @@ ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider
int64_t epoc = wait(BlobManifestLoader::lastBlobEpoc(loader));
return epoc;
}
// Returns true if the given key range is being restored
ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef keys) {
state Transaction tr(db);
loop {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
RangeResult ranges = wait(tr.getRange(blobRestoreCommandKeys, CLIENT_KNOBS->TOO_MANY));
for (auto& r : ranges) {
KeyRange keyRange = decodeBlobRestoreCommandKeyFor(r.key);
if (keyRange.contains(keys)) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(r.value);
return status.progress < 100; // restore is still in progress until progress reaches 100
}
}
return false;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}

View File

@ -21,6 +21,7 @@
#include "flow/ActorCollection.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/BlobConnectionProvider.h"
@ -63,14 +64,7 @@ public:
// Start migration
ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
if (!isFullRestoreMode()) {
return Void();
}
wait(delay(10)); // TODO need to wait for a signal for readiness of blob manager
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
self->blobGranules_ = granules;
wait(checkIfReadyForMigration(self));
wait(prepare(self, normalKeys));
wait(advanceVersion(self));
wait(serverLoop(self));
@ -78,6 +72,28 @@ public:
}
private:
// Check if blob manifest is loaded so that blob migration can start
ACTOR static Future<Void> checkIfReadyForMigration(Reference<BlobMigrator> self) {
loop {
bool isFullRestore = wait(isFullRestoreMode(self->db_, normalKeys));
if (isFullRestore) {
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
if (!granules.empty()) {
self->blobGranules_ = granules;
for (BlobGranuleRestoreVersion granule : granules) {
TraceEvent("RestorableGranule")
.detail("GranuleId", granule.granuleID.toString())
.detail("KeyRange", granule.keyRange.toString())
.detail("Version", granule.version)
.detail("SizeInBytes", granule.sizeInBytes);
}
return Void();
}
}
wait(delay(SERVER_KNOBS->BLOB_MIGRATOR_CHECK_INTERVAL));
}
}
// Prepare for data migration for given key range.
ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
// Register as a storage server, so that DataDistributor could start data movement after
@ -136,8 +152,9 @@ private:
}
}
if (owning) {
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
TraceEvent("UnassignKeys").detail("Keys", keys.toString()).detail("From", id.toString());
}
}
wait(tr.commit());
@ -185,8 +202,10 @@ private:
// Calculated progress
int64_t total = sizeInBytes(self);
int progress = (total - incompleted) * 100 / total;
bool done = incompleted == 0;
dprint("Progress {} :{}%. done {}\n", serverID.toString(), progress, done);
state bool done = incompleted == 0;
dprint("Migration progress :{}%. done {}\n", progress, done);
TraceEvent("BlobMigratorProgress").detail("Progress", progress).detail("Done", done);
wait(updateProgress(self, normalKeys, progress));
return done;
} catch (Error& e) {
wait(tr.onError(e));
@ -194,6 +213,32 @@ private:
}
}
// Update restore progress
ACTOR static Future<Void> updateProgress(Reference<BlobMigrator> self, KeyRangeRef range, int progress) {
state Transaction tr(self->db_);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr.get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (progress > status.progress) {
status.progress = progress;
Value updatedValue = blobRestoreCommandValueFor(status);
tr.set(key, updatedValue);
wait(tr.commit());
}
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Advance version, so that future commits will have a larger version than the restored data
ACTOR static Future<Void> advanceVersion(Reference<BlobMigrator> self) {
state Transaction tr(self->db_);
@ -207,6 +252,7 @@ private:
if (currentVersion <= expectedVersion) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(expectedVersion + 1, Unversioned()));
dprint("Advance version from {} to {}\n", currentVersion, expectedVersion);
TraceEvent("AdvanceVersion").detail("Current", currentVersion).detail("New", expectedVersion);
wait(tr.commit());
}
return Void();
@ -218,7 +264,7 @@ private:
// Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
self->actors_.add(waitFailureServer(self->interf_.waitFailure.getFuture()));
self->actors_.add(logProgress(self));
self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self));
@ -226,6 +272,7 @@ private:
try {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
dprint("Stopping blob migrator {}\n", self->interf_.id().toString());
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
break;
@ -237,6 +284,8 @@ private:
throw;
}
}
self->actors_.clear(true);
dprint("Stopped blob migrator {}\n", self->interf_.id().toString());
return Void();
}
@ -267,7 +316,7 @@ private:
req.reply.send(rep);
}
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
fmt::print("Handle GetStorageMetrics\n");
// fmt::print("Handle GetStorageMetrics\n");
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self);
GetStorageMetricsReply resp;
@ -331,7 +380,7 @@ private:
req.reply.sendError(unsupported_operation());
}
when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
dprint("Unsupported UpdateCommitCostRequest\n");
// dprint("Unsupported UpdateCommitCostRequest\n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
@ -358,9 +407,9 @@ private:
}
ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME get rid of this delay. it's a temp solution to avoid starvation scheduling of DD
// processes
// dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME get rid of this delay. it's a temp solution to avoid starvation scheduling of DD
// processes
wait(delay(1));
req.reply.sendError(unsupported_operation());
return Void();
@ -398,7 +447,8 @@ private:
// Main entry point
ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", interf.id().toString());
TraceEvent("StartBlobMigrator").detail("Interface", interf.id().toString());
dprint("Starting blob migrator {}\n", interf.id().toString());
try {
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
wait(BlobMigrator::start(self));

View File

@ -292,6 +292,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
int64_t lastResidentMemory = 0;
double lastResidentMemoryCheckTime = -100.0;
bool isFullRestoreMode = false;
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db)
: id(id), db(db), tenantData(BGTenantMap(dbInfo)), dbInfo(dbInfo),
initialSnapshotLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM)),
@ -2146,7 +2148,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
// No need to start Change Feed in full restore mode
if (isFullRestoreMode())
if (bwData->isFullRestoreMode)
return Void();
checkMergeCandidate = granuleCheckMergeCandidate(bwData,
@ -3588,7 +3590,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
state Reference<GranuleMetadata> metadata = m;
// state Version granuleBeginVersion = req.beginVersion;
// skip waiting for CF ready for recovery mode
if (!isFullRestoreMode()) {
if (!bwData->isFullRestoreMode) {
choose {
when(wait(metadata->readable.getFuture())) {}
when(wait(metadata->cancelled.getFuture())) { throw wrong_shard_server(); }
@ -3646,7 +3648,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// this is an active granule query
loop {
// skip check since CF doesn't start for bare metal recovery mode
if (isFullRestoreMode()) {
if (bwData->isFullRestoreMode) {
break;
}
if (!metadata->activeCFData.get().isValid() || !metadata->cancelled.canBeSet()) {
@ -3689,7 +3691,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// if feed was popped by another worker and BW only got empty versions, it wouldn't itself see that it
// got popped, but we can still reject the request. In theory this should never happen with the other
// protections in place, but it's a useful and inexpensive sanity check
if (!isFullRestoreMode()) {
if (!bwData->isFullRestoreMode) {
Version emptyVersion = metadata->activeCFData.get()->popVersion - 1;
if (req.readVersion > metadata->durableDeltaVersion.get() &&
emptyVersion > metadata->bufferedDeltaVersion) {
@ -3995,6 +3997,9 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
throw granule_assignment_conflict();
}
bool isFullRestore = wait(isFullRestoreMode(bwData->db, req.keyRange));
bwData->isFullRestoreMode = isFullRestore;
Optional<Value> prevLockValue = wait(fLockValue);
state bool hasPrevOwner = prevLockValue.present();
state bool createChangeFeed = false;
@ -4069,7 +4074,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
// for recovery mode - don't create change feed, don't create snapshot
if (isFullRestoreMode()) {
if (bwData->isFullRestoreMode) {
createChangeFeed = false;
info.doSnapshot = false;
GranuleFiles granuleFiles = wait(loadPreviousFiles(&tr, info.granuleID));
@ -4091,7 +4096,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
}
if (createChangeFeed && !isFullRestoreMode()) {
if (createChangeFeed && !bwData->isFullRestoreMode) {
// create new change feed for new version of granule
wait(updateChangeFeed(
&tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));
@ -4103,7 +4108,8 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// If anything in previousGranules, need to do the handoff logic and set
// ret.previousChangeFeedId, and the previous durable version will come from the previous
// granules
if (info.history.present() && info.history.get().value.parentVersions.size() > 0 && !isFullRestoreMode()) {
if (info.history.present() && info.history.get().value.parentVersions.size() > 0 &&
!bwData->isFullRestoreMode) {
CODE_PROBE(true, "Granule open found parent");
if (info.history.get().value.parentVersions.size() == 1) { // split
state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0],

View File

@ -23,6 +23,7 @@
#include <map>
#include <memory>
#include <set>
#include <tuple>
#include <vector>
#include "fdbclient/FDBTypes.h"
@ -691,7 +692,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
WorkerDetails newMGWorker;
if (self->db.blobGranulesEnabled.get()) {
newBMWorker = findNewProcessForSingleton(self, ProcessClass::BlobManager, id_used);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
newMGWorker = findNewProcessForSingleton(self, ProcessClass::BlobMigrator, id_used);
}
}
@ -710,7 +711,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
ProcessClass::Fitness bestFitnessForMG;
if (self->db.blobGranulesEnabled.get()) {
bestFitnessForBM = findBestFitnessForSingleton(self, newBMWorker, ProcessClass::BlobManager);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
bestFitnessForMG = findBestFitnessForSingleton(self, newMGWorker, ProcessClass::BlobManager);
}
}
@ -744,7 +745,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) {
bmHealthy = isHealthySingleton<BlobManagerInterface>(
self, newBMWorker, bmSingleton, bestFitnessForBM, self->recruitingBlobManagerID);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
mgHealthy = isHealthySingleton<BlobMigratorInterface>(
self, newMGWorker, mgSingleton, bestFitnessForMG, self->recruitingBlobMigratorID);
}
@ -775,7 +776,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) {
currBMProcessId = bmSingleton.interface.get().locality.processId();
newBMProcessId = newBMWorker.interf.locality.processId();
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
currMGProcessId = mgSingleton.interface.get().locality.processId();
newMGProcessId = newMGWorker.interf.locality.processId();
}
@ -792,7 +793,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) {
currPids.emplace_back(currBMProcessId);
newPids.emplace_back(newBMProcessId);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
currPids.emplace_back(currMGProcessId);
newPids.emplace_back(newMGProcessId);
}
@ -810,7 +811,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (!self->db.blobGranulesEnabled.get()) {
ASSERT(currColocMap[currBMProcessId] == 0);
ASSERT(newColocMap[newBMProcessId] == 0);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
ASSERT(currColocMap[currMGProcessId] == 0);
ASSERT(newColocMap[newMGProcessId] == 0);
}
@ -836,7 +837,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
ddSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && newColocMap[newBMProcessId] < currColocMap[currBMProcessId]) {
bmSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() &&
} else if (self->db.blobGranulesEnabled.get() && self->db.blobRestoreEnabled.get() &&
newColocMap[newMGProcessId] < currColocMap[currMGProcessId]) {
mgSingleton.recruit(self);
} else if (SERVER_KNOBS->ENABLE_ENCRYPTION && newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) {
@ -1404,13 +1405,13 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
self, w, currSingleton, registeringSingleton, self->recruitingRatekeeperID);
}
if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() && req.blobManagerInterf.present()) {
if (self->db.blobGranulesEnabled.get() && req.blobManagerInterf.present()) {
auto currSingleton = BlobManagerSingleton(self->db.serverInfo->get().blobManager);
auto registeringSingleton = BlobManagerSingleton(req.blobManagerInterf);
haltRegisteringOrCurrentSingleton<BlobManagerInterface>(
self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID);
}
if (req.blobMigratorInterf.present()) {
if (req.blobMigratorInterf.present() && self->db.blobRestoreEnabled.get()) {
auto currSingleton = BlobMigratorSingleton(self->db.serverInfo->get().blobMigrator);
auto registeringSingleton = BlobMigratorSingleton(req.blobMigratorInterf);
haltRegisteringOrCurrentSingleton<BlobMigratorInterface>(
@ -2553,6 +2554,43 @@ ACTOR Future<int64_t> getNextBMEpoch(ClusterControllerData* self) {
}
}
ACTOR Future<Void> watchBlobRestoreCommand(ClusterControllerData* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
state Key blobRestoreCommandKey = blobRestoreCommandKeyFor(normalKeys);
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> blobRestoreCommand = wait(tr->get(blobRestoreCommandKey));
if (blobRestoreCommand.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(blobRestoreCommand.get());
TraceEvent("WatchBlobRestoreCommand").detail("Progress", status.progress);
if (status.progress == 0) {
self->db.blobRestoreEnabled.set(true);
if (self->db.blobGranulesEnabled.get()) {
const auto& blobManager = self->db.serverInfo->get().blobManager;
if (blobManager.present()) {
BlobManagerSingleton(blobManager)
.haltBlobGranules(self, blobManager.get().locality.processId());
}
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
if (blobMigrator.present()) {
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
}
}
}
}
state Future<Void> watch = tr->watch(blobRestoreCommandKey);
wait(tr->commit());
wait(watch);
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<Void> startBlobMigrator(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window.
@ -2629,9 +2667,8 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
}
loop {
if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
state Future<Void> wfClient =
waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
loop {
choose {
when(wait(wfClient)) {
@ -2643,11 +2680,11 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
when(wait(self->recruitBlobMigrator.onChange())) {}
}
}
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode()) {
} else if (self->db.blobGranulesEnabled.get() && self->db.blobRestoreEnabled.get()) {
// if there is no blob migrator present but blob granules and blob restore are enabled, recruit a blob migrator
wait(startBlobMigrator(self, recruitThrottler.newRecruitment()));
} else {
wait(self->db.blobGranulesEnabled.onChange());
wait(self->db.blobGranulesEnabled.onChange() || self->db.blobRestoreEnabled.onChange());
}
}
}
@ -2778,7 +2815,7 @@ ACTOR Future<Void> monitorBlobManager(ClusterControllerData* self) {
const auto& blobManager = self->db.serverInfo->get().blobManager;
BlobManagerSingleton(blobManager)
.haltBlobGranules(self, blobManager.get().locality.processId());
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
}
@ -3079,8 +3116,9 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(monitorDataDistributor(&self));
self.addActor.send(monitorRatekeeper(&self));
self.addActor.send(monitorBlobManager(&self));
self.addActor.send(monitorBlobMigrator(&self));
self.addActor.send(watchBlobGranulesConfigKey(&self));
self.addActor.send(monitorBlobMigrator(&self));
self.addActor.send(watchBlobRestoreCommand(&self));
self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self));

View File

@ -414,7 +414,8 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData* commitData,
}
Optional<TenantNameRef> const& tenantName = req.tenantInfo.name;
if (tenantName.present() && commitData->tenantsOverStorageQuota.count(tenantName.get()) > 0) {
if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED && tenantName.present() &&
commitData->tenantsOverStorageQuota.count(tenantName.get()) > 0) {
req.reply.sendError(storage_quota_exceeded());
continue;
}
@ -829,7 +830,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) {
// Disabled for the recovery transaction; otherwise, recovery can't finish and keeps doing more recoveries.
CODE_PROBE(true, "Reject transactions in the batch");
TraceEvent(SevWarnAlways, "ProxyReject", pProxyCommitData->dbgid)
TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyReject", pProxyCommitData->dbgid)
.suppressFor(0.1)
.detail("QDelay", queuingDelay)
.detail("Transactions", trs.size())
@ -2971,7 +2972,9 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
proxy.expireIdempotencyId,
commitData.expectedIdempotencyIdCountForKey,
&commitData.idempotencyClears));
addActor.send(monitorTenantsOverStorageQuota(proxy.id(), db, &commitData));
if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
addActor.send(monitorTenantsOverStorageQuota(proxy.id(), db, &commitData));
}
// wait for txnStateStore recovery
wait(success(commitData.txnStateStore->readValue(StringRef())));

View File

@ -1423,6 +1423,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
state double startTime = now();
state std::vector<UID> destIds;
state uint64_t debugID = deterministicRandom()->randomUInt64();
state bool enableShardMove = SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD;
try {
if (now() - self->lastInterval < 1.0) {
@ -1539,8 +1540,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
req.src = rd.src;
req.completeSources = rd.completeSources;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 1) {
if (enableShardMove && tciIndex == 1) {
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
@ -1587,64 +1587,58 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
anyWithSource = true;
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary
// team Thus, tryGetAvailableRemoteTeamWith() may select an almost full remote team In this
// case, we must re-select a remote team We set foundTeams = false to avoid finishing team
// selection Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select
// a remote team
if (enableShardMove) {
if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
// Critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() is forced to select a remote team paired with a
// primary team, so it may select an almost-full remote team. In this case, we must
// re-select a remote team, so we set foundTeams = false to avoid finishing team
// selection. forceToUseNewPhysicalShard is then set, which forces getTeam to select
// a remote team.
double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
foundTeams = false;
break;
}
}
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// Critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() is forced to select a remote team paired with a
// primary team, so it may select an unhealthy remote team. In this case, we must
// re-select a remote team, so we set foundTeams = false to avoid finishing team
// selection. forceToUseNewPhysicalShard is then set, which forces getTeam to select
// a remote team.
if (!bestTeam.first.get()->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
break;
}
}
bestTeams.emplace_back(bestTeam.first.get(), true);
// Always set bestTeams[i].second = true to disable the optimization in data moves between DCs,
// for the correctness of PhysicalShardCollection.
// Currently, enabling the optimization would break the invariant of PhysicalShardCollection.
// Invariant: once a physical shard is created with a specific set of SSes, this SS set will
// never get changed.
if (tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
} else {
bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
}
// get physicalShardIDCandidate
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
}
tciIndex++;
}
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary team
// Thus, tryGetAvailableRemoteTeamWith() may select an unhealthy remote team
// In this case, we must re-select a remote team
// We set foundTeams = false to avoid finishing team selection
// Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select a remote team
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
}
}
// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
// already
anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);
@ -1665,7 +1659,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("AnyDestOverloaded", anyDestOverloaded)
.detail("NumOfTeamCollections", self->teamCollections.size())
.detail("Servers", destServersString(bestTeams));
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
if (rd.isRestore() && destOverloadedCount > 50) {
throw data_move_dest_team_not_found();
}
@ -1689,14 +1683,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// When forceToUseNewPhysicalShard = false, we get a paired primary team and remote team.
// However, this may fail.
// Any retry triggers use of a new physicalShard, which enters the normal routine.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
forceToUseNewPhysicalShard = true;
}
// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
if (!rd.isRestore()) {
// when !rd.isRestore(), dataMoveId is decided as physicalShardIDCandidate;
// thus, propagate physicalShardIDCandidate to the related data structures
@ -1954,7 +1948,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
self->shardsAffectedByTeamFailure->finishMove(rd.keys);
relocationComplete.send(rd);
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
// update physical shard collection
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams;
for (int i = 0; i < bestTeams.size(); i++) {

View File

@ -588,7 +588,6 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<DDTeamCollection> primaryTeamCollection;
state Reference<DDTeamCollection> remoteTeamCollection;
state bool trackerCancelled;
state bool ddIsTenantAware = SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED;
loop {
trackerCancelled = false;
self->initialized = Promise<Void>();
@ -610,7 +609,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false));
state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false));
if (ddIsTenantAware) {
if (SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED || SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
self->ddTenantCache = makeReference<TenantCache>(cx, self->ddId);
wait(self->ddTenantCache.get()->build());
}
@ -684,6 +683,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
}
if (self->ddTenantCache.present() && SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
actors.push_back(reportErrorsExcept(self->ddTenantCache.get()->monitorStorageQuota(),
"StorageQuotaTracker",
self->ddId,
@ -1320,7 +1321,7 @@ GetStorageWigglerStateReply getStorageWigglerStates(Reference<DataDistributor> s
TenantsOverStorageQuotaReply getTenantsOverStorageQuota(Reference<DataDistributor> self) {
TenantsOverStorageQuotaReply reply;
if (self->ddTenantCache.present()) {
if (self->ddTenantCache.present() && SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
reply.tenants = self->ddTenantCache.get()->getTenantsOverQuota();
}
return reply;

View File

@ -446,11 +446,14 @@ void proxyGRVThresholdExceeded(const GetReadVersionRequest* req, GrvProxyStats*
++stats->txnRequestErrors;
req->reply.sendError(grv_proxy_memory_limit_exceeded());
if (req->priority == TransactionPriority::IMMEDIATE) {
TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceededSystem").suppressFor(60);
TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyGRVThresholdExceededSystem")
.suppressFor(60);
} else if (req->priority == TransactionPriority::DEFAULT) {
TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceededDefault").suppressFor(60);
TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyGRVThresholdExceededDefault")
.suppressFor(60);
} else {
TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceededBatch").suppressFor(60);
TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyGRVThresholdExceededBatch")
.suppressFor(60);
}
}
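The same simulation-aware severity pattern now appears in both the commit proxy and the GRV proxy. A hypothetical helper, not part of this diff, that would capture it in one place:

// Hypothetical: demote alert severities under simulation so that expected
// overload warnings do not trip simulation's severity checks.
inline Severity simAwareSeverity(Severity sev) {
    return g_network->isSimulated() ? SevInfo : sev;
}
// e.g. TraceEvent(simAwareSeverity(SevWarnAlways), "ProxyGRVThresholdExceededBatch").suppressFor(60);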

View File

@ -58,6 +58,14 @@ void GrvProxyTagThrottler::TagQueue::rejectRequests(LatencyBandsMap& latencyBand
}
}
void GrvProxyTagThrottler::TagQueue::endReleaseWindow(int64_t numStarted, double elapsed) {
if (rateInfo.present()) {
CODE_PROBE(requests.empty(), "Tag queue ending release window with empty request queue");
CODE_PROBE(!requests.empty(), "Tag queue ending release window with requests still queued");
rateInfo.get().endReleaseWindow(numStarted, requests.empty(), elapsed);
}
}
GrvProxyTagThrottler::GrvProxyTagThrottler(double maxThrottleDuration)
: maxThrottleDuration(maxThrottleDuration),
latencyBandsMap("GrvProxyTagThrottler",
@ -202,16 +210,14 @@ void GrvProxyTagThrottler::releaseTransactions(double elapsed,
}
}
// End release windows for queues with valid rateInfo
// End release windows for all tag queues
{
TransactionTagMap<uint32_t> transactionsReleasedMap;
for (const auto& [tag, count] : transactionsReleased) {
transactionsReleasedMap[tag] = count;
}
for (auto& [tag, queue] : queues) {
if (queue.rateInfo.present()) {
queue.rateInfo.get().endReleaseWindow(transactionsReleasedMap[tag], false, elapsed);
}
queue.endReleaseWindow(transactionsReleasedMap[tag], elapsed);
}
}
// If the capacity is increased, that means the vector has been illegally resized, potentially
@ -438,3 +444,33 @@ TEST_CASE("/GrvProxyTagThrottler/Fifo") {
wait(mockFifoClient(&throttler));
return Void();
}
// Tests that while throughput is low, the tag throttler
// does not accumulate too much budget.
//
// A server is set up to serve 10 transactions per second,
// then runs idly for 60 seconds. Then a client starts
// and attempts 20 transactions per second for 60 seconds.
// The server throttles the client to only achieve
// 10 transactions per second during this 60 second window.
// If the throttler is allowed to accumulate budget indefinitely
// during the idle 60 seconds, this test will fail.
TEST_CASE("/GrvProxyTagThrottler/LimitedIdleBudget") {
state GrvProxyTagThrottler throttler(5.0);
state TagSet tagSet;
state TransactionTagMap<uint32_t> counters;
{
TransactionTagMap<double> rates;
rates["sampleTag"_sr] = 10.0;
throttler.updateRates(rates);
}
tagSet.addTag("sampleTag"_sr);
state Future<Void> server = mockServer(&throttler);
wait(delay(60.0));
state Future<Void> client = mockClient(&throttler, TransactionPriority::DEFAULT, tagSet, 1, 20.0, &counters);
wait(timeout(client && server, 60.0, Void()));
TraceEvent("TagQuotaTest_LimitedIdleBudget").detail("Counter", counters["sampleTag"_sr]);
ASSERT(isNear(counters["sampleTag"_sr], 60.0 * 10.0));
return Void();
}

View File

@ -35,7 +35,7 @@ bool GrvTransactionRateInfo::canStart(int64_t numAlreadyStarted, int64_t count)
std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed) {
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed) {
// Update the budget to accumulate any extra capacity available or remove any excess that was used.
// The actual delta is the portion of the limit we didn't use multiplied by the fraction of the rate window that
// elapsed.
@ -52,16 +52,15 @@ void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool
//
// Note that "rate window" here indicates a period of SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW seconds,
// whereas "release window" is the period between wait statements, with duration indicated by "elapsed."
budget =
std::max(0.0, budget + elapsed * (limit - numStartedAtPriority) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
budget = std::max(0.0, budget + elapsed * (limit - numStarted) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
// If we are emptying out the queue of requests, then we don't need to carry much budget forward
// If we did keep accumulating budget, then our responsiveness to changes in workload could be compromised
if (queueEmptyAtPriority) {
if (queueEmpty) {
budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET);
}
smoothReleased.addDelta(numStartedAtPriority);
smoothReleased.addDelta(numStarted);
}
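A worked example of the update above as a self-contained sketch; the knob values here are hypothetical, but the arithmetic is exactly the endReleaseWindow formula together with the empty-queue clamp:

#include <algorithm>
#include <cstdio>

int main() {
    const double RATE_WINDOW = 2.0;             // hypothetical START_TRANSACTION_RATE_WINDOW
    const double MAX_EMPTY_QUEUE_BUDGET = 10.0; // hypothetical START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET
    double limit = 20.0;                        // transactions permitted per rate window
    double budget = 0.0;

    // A 0.5s release window in which only 5 transactions started: the unused
    // portion of the limit accumulates as budget.
    double elapsed = 0.5, numStarted = 5;
    budget = std::max(0.0, budget + elapsed * (limit - numStarted) / RATE_WINDOW);
    std::printf("after busy window: %.2f\n", budget); // 3.75

    // A long idle window with an empty queue: without the clamp, the budget
    // would bank 60 * 20 / 2 = 600 extra transactions of burst capacity.
    elapsed = 60.0;
    numStarted = 0;
    budget = std::max(0.0, budget + elapsed * (limit - numStarted) / RATE_WINDOW);
    budget = std::min(budget, MAX_EMPTY_QUEUE_BUDGET); // queueEmpty == true
    std::printf("after idle window: %.2f\n", budget); // 10.00
    return 0;
}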
void GrvTransactionRateInfo::disable() {

View File

@ -391,9 +391,16 @@ struct Counters {
CounterCollection cc;
Counter immediateThrottle;
Counter failedToAcquire;
Counter deleteKeyReqs;
Counter deleteRangeReqs;
Counter convertedDeleteKeyReqs;
Counter convertedDeleteRangeReqs;
Counters()
: cc("RocksDBThrottle"), immediateThrottle("ImmediateThrottle", cc), failedToAcquire("FailedToAcquire", cc) {}
: cc("RocksDBThrottle"), immediateThrottle("ImmediateThrottle", cc), failedToAcquire("FailedToAcquire", cc),
deleteKeyReqs("DeleteKeyRequests", cc), deleteRangeReqs("DeleteRangeRequests", cc),
convertedDeleteKeyReqs("ConvertedDeleteKeyRequests", cc),
convertedDeleteRangeReqs("ConvertedDeleteRangeRequests", cc) {}
};
struct ReadIterator {
@ -1934,12 +1941,17 @@ struct RocksDBKeyValueStore : IKeyValueStore {
}
ASSERT(defaultFdbCF != nullptr);
// Number of deletes to rocksdb = counters.deleteKeyReqs + convertedDeleteKeyReqs;
// Number of deleteRanges to rocksdb = counters.deleteRangeReqs - counters.convertedDeleteRangeReqs;
if (keyRange.singleKeyRange()) {
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
++counters.deleteKeyReqs;
} else {
++counters.deleteRangeReqs;
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr &&
storageMetrics->byteSample.getEstimate(keyRange) <
SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) {
++counters.convertedDeleteRangeReqs;
rocksdb::ReadOptions options = sharedState->getReadOptions();
auto beginSlice = toSlice(keyRange.begin);
auto endSlice = toSlice(keyRange.end);
@ -1949,6 +1961,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
cursor->Seek(toSlice(keyRange.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) {
writeBatch->Delete(defaultFdbCF, cursor->key());
++counters.convertedDeleteKeyReqs;
cursor->Next();
}
if (!cursor->status().ok()) {
@ -1958,6 +1971,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
auto it = keysSet.lower_bound(keyRange.begin);
while (it != keysSet.end() && *it < keyRange.end) {
writeBatch->Delete(defaultFdbCF, toSlice(*it));
++counters.convertedDeleteKeyReqs;
it++;
}
}
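A self-contained distillation of the branch logic and the counter arithmetic noted above; the names mirror the diff, while the byte-sample estimate and cursor iteration are stubbed out as plain parameters:

#include <cstdint>
#include <cstdio>

struct DeleteCounters {
    int64_t deleteKeyReqs = 0, deleteRangeReqs = 0;
    int64_t convertedDeleteKeyReqs = 0, convertedDeleteRangeReqs = 0;
};

// Convert a ClearRange into point deletes only when the knob is on and the
// estimated range size is under the limit; keysInRange stands in for
// iterating the RocksDB cursor over the range.
void clearRange(bool singleKey, bool knobEnabled, int64_t estBytes, int64_t limitBytes,
                int keysInRange, DeleteCounters& c) {
    if (singleKey) {
        ++c.deleteKeyReqs; // plain point delete
    } else {
        ++c.deleteRangeReqs;
        if (knobEnabled && estBytes < limitBytes) {
            ++c.convertedDeleteRangeReqs;
            c.convertedDeleteKeyReqs += keysInRange; // one point delete per key
        }
    }
}

int main() {
    DeleteCounters c;
    clearRange(false, true, 4096, 1 << 20, 3, c);
    // deletes issued to rocksdb = deleteKeyReqs + convertedDeleteKeyReqs = 3
    // deleteRanges issued      = deleteRangeReqs - convertedDeleteRangeReqs = 0
    std::printf("%lld %lld\n",
                (long long)(c.deleteKeyReqs + c.convertedDeleteKeyReqs),
                (long long)(c.deleteRangeReqs - c.convertedDeleteRangeReqs));
    return 0;
}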

View File

@ -289,11 +289,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
// Detect conflicts
double expire = now() + SERVER_KNOBS->SAMPLE_EXPIRATION_TIME;
ConflictBatch conflictBatch(self->conflictSet, &reply.conflictingKeyRangeMap, &reply.arena);
Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
newOldestVersion = req.version - std::max(5 * SERVER_KNOBS->VERSIONS_PER_SECOND,
SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS);
}
const Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
for (int t = 0; t < req.transactions.size(); t++) {
conflictBatch.addTransaction(req.transactions[t], newOldestVersion);
self->resolvedReadConflictRanges += req.transactions[t].read_conflict_ranges.size();

View File

@ -422,11 +422,12 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
state LogMessageVersion msgVersion;
msgVersion.version = reader.consumeNetworkUInt64();
msgVersion.sub = reader.consumeNetworkUInt32();
int msgSize = reader.consumeNetworkInt32();
const uint8_t* message = reader.consume(msgSize);
state int msgSize = reader.consumeNetworkInt32();
state const uint8_t* message = reader.consume(msgSize);
// Skip mutations out of the version range
if (!asset.isInVersionRange(msgVersion.version)) {
wait(yield()); // avoid potential stack overflows
continue;
}
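The change above follows from a general flow-actor rule: a local read after a wait() must be declared state, because plain locals do not survive the suspension point that the newly added wait(yield()) introduces. A minimal sketch, with the pointer-based reader signature and process() as hypothetical stand-ins:

ACTOR Future<Void> parseOneMessage(StringRefReader* reader) {
    state int msgSize = reader->consumeNetworkInt32();
    state const uint8_t* message = reader->consume(msgSize);
    wait(yield()); // suspension point; non-state locals would be dead past here
    process(message, msgSize); // hypothetical consumer; safe because both are state
    return Void();
}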

View File

@ -127,25 +127,38 @@ public:
loop {
state double fetchStartTime = now();
state std::vector<TenantName> tenants = tenantCache->getTenantList();
state std::vector<TenantGroupName> groups;
for (const auto& [group, storage] : tenantCache->tenantStorageMap) {
groups.push_back(group);
}
state int i;
for (i = 0; i < tenants.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
tenantCache->tenantStorageMap[tenants[i]].usage = size;
break;
} catch (Error& e) {
if (e.code() == error_code_tenant_not_found) {
tenantCache->tenantStorageMap.erase(tenants[i]);
for (i = 0; i < groups.size(); i++) {
state TenantGroupName group = groups[i];
state int64_t usage = 0;
// `tenants` needs to be a copy so that the erase (below) or inserts/erases from other
// functions (when this actor yields) do not interfere with the iteration
state std::unordered_set<TenantName> tenants = tenantCache->tenantStorageMap[group].tenants;
state std::unordered_set<TenantName>::iterator iter = tenants.begin();
for (; iter != tenants.end(); iter++) {
state TenantName tenant = *iter;
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenant);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
usage += size;
break;
} else {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
} catch (Error& e) {
if (e.code() == error_code_tenant_not_found) {
tenantCache->tenantStorageMap[group].tenants.erase(tenant);
break;
} else {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
}
}
}
tenantCache->tenantStorageMap[group].usage = usage;
}
lastTenantListFetchTime = now();
@ -162,22 +175,24 @@ public:
state Transaction tr(tenantCache->dbcx());
loop {
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
for (auto const kv : currentQuotas) {
TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix);
int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[tenant].quota = quota;
}
tr.reset();
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
// Reset the quota for all groups; this essentially sets the quota to `max` for groups where the
// quota might have been cleared (i.e., groups that will not be returned by the `getRange` request above).
for (auto& [group, storage] : tenantCache->tenantStorageMap) {
storage.quota = std::numeric_limits<int64_t>::max();
}
for (const auto kv : currentQuotas) {
const TenantGroupName group = kv.key.removePrefix(storageQuotaPrefix);
const int64_t quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[group].quota = quota;
}
tr.reset();
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
}
}
};
@ -189,6 +204,10 @@ void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) {
TenantInfo tenantInfo(tenantName, Optional<Standalone<StringRef>>(), tenant.id);
tenantCache[tenantPrefix] = makeReference<TCTenantInfo>(tenantInfo, tenant.prefix);
tenantCache[tenantPrefix]->updateCacheGeneration(generation);
if (tenant.tenantGroup.present()) {
tenantStorageMap[tenant.tenantGroup.get()].tenants.insert(tenantName);
}
}
void TenantCache::startRefresh() {
@ -289,13 +308,13 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const {
}
std::unordered_set<TenantName> TenantCache::getTenantsOverQuota() const {
std::unordered_set<TenantName> tenants;
for (const auto& [tenant, storage] : tenantStorageMap) {
std::unordered_set<TenantName> tenantsOverQuota;
for (const auto& [tenantGroup, storage] : tenantStorageMap) {
if (storage.usage > storage.quota) {
tenants.insert(tenant);
tenantsOverQuota.insert(storage.tenants.begin(), storage.tenants.end());
}
}
return tenants;
return tenantsOverQuota;
}
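The group-level bookkeeping above reduces to a small standalone computation: usage is summed per group, a missing quota defaults to int64 max, and every tenant of an over-quota group is reported. A compilable sketch with names mirroring tenantStorageMap:

#include <cstdint>
#include <limits>
#include <string>
#include <unordered_map>
#include <unordered_set>

struct GroupStorage {
    int64_t quota = std::numeric_limits<int64_t>::max(); // no quota set
    int64_t usage = 0;
    std::unordered_set<std::string> tenants;
};

// Mirrors TenantCache::getTenantsOverQuota(): every tenant of an over-quota
// group is reported, not only the tenant whose writes pushed the group over.
std::unordered_set<std::string> tenantsOverQuota(
    const std::unordered_map<std::string, GroupStorage>& tenantStorageMap) {
    std::unordered_set<std::string> result;
    for (const auto& [group, storage] : tenantStorageMap) {
        if (storage.usage > storage.quota) {
            result.insert(storage.tenants.begin(), storage.tenants.end());
        }
    }
    return result;
}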
Future<Void> TenantCache::monitorTenantMap() {

View File

@ -2025,7 +2025,8 @@ public:
bool memoryOnly,
Reference<IPageEncryptionKeyProvider> keyProvider,
Promise<Void> errorPromise = {})
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS),
: keyProvider(keyProvider),
ioLock(makeReference<PriorityMultiLock>(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_IO_PRIORITIES)),
pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize),
filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise),
remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
@ -2037,7 +2038,7 @@ public:
// This sets the page cache size for all PageCacheT instances using the same evictor
pageCache.evictor().sizeLimit = pageCacheBytes;
g_redwoodMetrics.ioLock = &ioLock;
g_redwoodMetrics.ioLock = ioLock.getPtr();
if (!g_redwoodMetricsActor.isValid()) {
g_redwoodMetricsActor = redwoodMetricsLogger();
}
@ -2499,7 +2500,7 @@ public:
unsigned int level,
bool header) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority));
state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(header ? ioMaxPriority : ioMinPriority));
++g_redwoodMetrics.metric.pagerDiskWrite;
g_redwoodMetrics.level(level).metrics.events.addEventReason(PagerEvents::PageWrite, reason);
if (self->memoryOnly) {
@ -2779,7 +2780,7 @@ public:
int blockSize,
int64_t offset,
int priority) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority)));
state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(std::min(priority, ioMaxPriority)));
++g_redwoodMetrics.metric.pagerDiskRead;
int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset));
return bytes;
@ -3593,7 +3594,7 @@ public:
// The next section explicitly cancels all pending operations held in the pager
debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str());
self->ioLock.kill();
self->ioLock->kill();
debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
self->recoverFuture.cancel();
@ -3802,7 +3803,7 @@ private:
Reference<IPageEncryptionKeyProvider> keyProvider;
PriorityMultiLock ioLock;
Reference<PriorityMultiLock> ioLock;
int64_t pageCacheBytes;
@ -8894,32 +8895,25 @@ void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) {
int maxPriority = ioLock->maxPriority();
if (e != nullptr) {
e->detail("ActiveReads", ioLock->totalRunners());
e->detail("AwaitReads", ioLock->totalWaiters());
e->detail("IOActiveTotal", ioLock->getRunnersCount());
e->detail("IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
e->detail(format("ActiveP%d", priority), ioLock->numRunners(priority));
e->detail(format("AwaitP%d", priority), ioLock->numWaiters(priority));
e->detail(format("IOActiveP%d", priority), ioLock->getRunnersCount(priority));
e->detail(format("IOWaitingP%d", priority), ioLock->getWaitersCount(priority));
}
}
if (s != nullptr) {
std::string active = "Active";
std::string await = "Await";
*s += "\n";
*s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners());
*s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters());
*s += "\n";
*s += format("%-15s %-8u ", "IOActiveTotal", ioLock->getRunnersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (active + 'P' + std::to_string(priority)).c_str(), ioLock->numRunners(priority));
*s += format("IOActiveP%-6d %-8u ", priority, ioLock->getRunnersCount(priority));
}
*s += "\n";
*s += format("%-15s %-8u ", "IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (await + 'P' + std::to_string(priority)).c_str(), ioLock->numWaiters(priority));
*s += format("IOWaitingP%-5d %-8u ", priority, ioLock->getWaitersCount(priority));
}
}
}
@ -11407,57 +11401,3 @@ TEST_CASE(":/redwood/performance/histograms") {
return Void();
}
ACTOR Future<Void> waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) {
state PriorityMultiLock::Lock lock = wait(pml->lock(priority));
wait(delay(deterministicRandom()->random01() * .1));
++*pout;
return Void();
}
TEST_CASE("/redwood/PriorityMultiLock") {
state std::vector<int> priorities = { 10, 20, 40 };
state int concurrency = 25;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
// Clog the lock by taking concurrency locks at each level
state std::vector<Future<PriorityMultiLock::Lock>> lockFutures;
for (int i = 0; i < priorities.size(); ++i) {
for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(i));
}
}
// Wait for n = concurrency locks to be acquired
wait(quorum(lockFutures, concurrency));
state std::vector<Future<Void>> futures;
for (int i = 0; i < 10e3; ++i) {
int p = i % priorities.size();
futures.push_back(waitLockIncrement(pml, p, &counts[p]));
}
state Future<Void> f = waitForAll(futures);
// Release the locks
lockFutures.clear();
// Print stats and wait for all futures to be ready
loop {
choose {
when(wait(delay(1))) {
printf("counts: ");
for (auto c : counts) {
printf("%d ", c);
}
printf(" pml: %s\n", pml->toString().c_str());
}
when(wait(f)) { break; }
}
}
delete pml;
return Void();
}

View File

@ -162,10 +162,7 @@ ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> b
ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider> blobConn);
inline bool isFullRestoreMode() {
return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE;
};
ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef range);
#include "flow/unactorcompiler.h"

View File

@ -30,6 +30,7 @@
struct BlobMigratorInterface {
constexpr static FileIdentifier file_identifier = 869199;
RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
RequestStream<ReplyPromise<Void>> waitFailure;
LocalityData locality;
UID uniqueID;
StorageServerInterface ssi;
@ -48,7 +49,7 @@ struct BlobMigratorInterface {
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, locality, uniqueID, haltBlobMigrator);
serializer(ar, locality, uniqueID, haltBlobMigrator, waitFailure);
}
};

View File

@ -144,6 +144,7 @@ public:
Future<Void> clientCounter;
int clientCount;
AsyncVar<bool> blobGranulesEnabled;
AsyncVar<bool> blobRestoreEnabled;
ClusterType clusterType = ClusterType::STANDALONE;
Optional<ClusterName> metaclusterName;
Optional<MetaclusterRegistrationEntry> metaclusterRegistration;
@ -159,7 +160,7 @@ public:
TaskPriority::DefaultEndpoint,
LockAware::True)), // SOMEDAY: Locality!
unfinishedRecoveries(0), logGenerations(0), cachePopulated(false), clientCount(0),
blobGranulesEnabled(config.blobGranulesEnabled) {
blobGranulesEnabled(config.blobGranulesEnabled), blobRestoreEnabled(false) {
clientCounter = countClients(this);
}

View File

@ -60,6 +60,7 @@ class GrvProxyTagThrottler {
void setRate(double rate);
bool isMaxThrottled(double maxThrottleDuration) const;
void rejectRequests(LatencyBandsMap&);
void endReleaseWindow(int64_t numStarted, double elapsed);
};
// Track the budgets for each tag

View File

@ -55,7 +55,7 @@ public:
// Updates the budget to accumulate any extra capacity available or remove any excess that was used.
// Call at the end of a release window.
void endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed);
void endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed);
// Smoothly sets rate. If currently disabled, reenable
void setRate(double rate);

View File

@ -35,8 +35,9 @@ typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;
struct Storage {
int64_t quota = std::numeric_limits<int64_t>::max();
int64_t usage = 0;
std::unordered_set<TenantName> tenants;
};
typedef std::unordered_map<TenantName, Storage> TenantStorageMap;
typedef std::unordered_map<TenantGroupName, Storage> TenantStorageMap;
struct TenantCacheTenantCreated {
KeyRange keys;
@ -56,7 +57,8 @@ private:
uint64_t generation;
TenantMapByPrefix tenantCache;
// Map from tenant names to storage quota and usage
// Map from tenant group names to the list of tenants, cumulative storage used by
// all the tenants in the group, and its storage quota.
TenantStorageMap tenantStorageMap;
// mark the start of a new sweep of the tenant cache
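A minimal sketch of how the group-keyed bookkeeping above is meant to be read (the group/tenant names and byte counts are illustrative assumptions, not values from this change):
// Sketch only: illustrates the Storage / TenantStorageMap semantics above.
TenantStorageMap storageMap;
Storage& s = storageMap["groupA"_sr]; // default-constructed: quota = int64 max
s.tenants.insert("tenant1"_sr);
s.tenants.insert("tenant2"_sr);
s.usage = 150; // assumed cumulative bytes used by tenant1 + tenant2
s.quota = 100; // assumed group-level quota
// With usage > quota, getTenantsOverQuota() reports BOTH tenants, because
// quota is now tracked per tenant group rather than per tenant.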

View File

@ -435,6 +435,7 @@ struct StorageServerDisk {
// The following are pointers to the Counters in StorageServer::counters of the same names.
Counter* kvCommitLogicalBytes;
Counter* kvClearRanges;
Counter* kvClearSingleKey;
Counter* kvGets;
Counter* kvScans;
Counter* kvCommits;
@ -1109,15 +1110,13 @@ public:
FlowLock serveFetchCheckpointParallelismLock;
PriorityMultiLock ssLock;
Reference<PriorityMultiLock> ssLock;
std::vector<int> readPriorityRanks;
Future<PriorityMultiLock::Lock> getReadLock(const Optional<ReadOptions>& options) {
// TODO: Fix perf regression in 100% cache read case where taking this lock adds too much overhead
return PriorityMultiLock::Lock();
// int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL);
// readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
// return ssLock.lock(readPriorityRanks[readType]);
int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL);
readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
return ssLock->lock(readPriorityRanks[readType]);
}
FlowLock serveAuditStorageParallelismLock;
@ -1172,6 +1171,8 @@ public:
Counter kvCommitLogicalBytes;
// Count of all clearRange operations to the storage engine.
Counter kvClearRanges;
// Count of all clearRange operations on a single key range (i.e., a single-key delete) to the storage engine.
Counter kvClearSingleKey;
// ClearRange operations issued by FDB, instead of from users, e.g., ClearRange operations to remove a shard
// from a storage server, as in removeDataRange().
Counter kvSystemClearRanges;
@ -1247,8 +1248,8 @@ public:
feedVersionQueries("FeedVersionQueries", cc), bytesInput("BytesInput", cc),
logicalBytesInput("LogicalBytesInput", cc), logicalBytesMoveInOverhead("LogicalBytesMoveInOverhead", cc),
kvCommitLogicalBytes("KVCommitLogicalBytes", cc), kvClearRanges("KVClearRanges", cc),
kvSystemClearRanges("KVSystemClearRanges", cc), bytesDurable("BytesDurable", cc),
bytesFetched("BytesFetched", cc), mutationBytes("MutationBytes", cc),
kvClearSingleKey("KVClearSingleKey", cc), kvSystemClearRanges("KVSystemClearRanges", cc),
bytesDurable("BytesDurable", cc), bytesFetched("BytesFetched", cc), mutationBytes("MutationBytes", cc),
feedBytesFetched("FeedBytesFetched", cc), sampledBytesCleared("SampledBytesCleared", cc),
kvFetched("KVFetched", cc), mutations("Mutations", cc), setMutations("SetMutations", cc),
clearRangeMutations("ClearRangeMutations", cc), atomicMutations("AtomicMutations", cc),
@ -1404,7 +1405,8 @@ public:
fetchKeysParallelismFullLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_FULL),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM),
ssLock(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES),
ssLock(makeReference<PriorityMultiLock>(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY,
SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES)),
serveAuditStorageParallelismLock(SERVER_KNOBS->SERVE_AUDIT_STORAGE_PARALLELISM),
instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false),
versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0),
@ -1412,7 +1414,7 @@ public:
busiestWriteTagContext(ssi.id()), counters(this),
storageServerSourceTLogIDEventHolder(
makeReference<EventCacheHolder>(ssi.id().toString() + "/StorageServerSourceTLogID")) {
readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READ_RANKS, ',');
readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READTYPE_PRIORITY_MAP, ',');
ASSERT(readPriorityRanks.size() > (int)ReadType::MAX);
version.initMetric("StorageServer.Version"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId());
@ -1431,6 +1433,7 @@ public:
this->storage.kvCommitLogicalBytes = &counters.kvCommitLogicalBytes;
this->storage.kvClearRanges = &counters.kvClearRanges;
this->storage.kvClearSingleKey = &counters.kvClearSingleKey;
this->storage.kvGets = &counters.kvGets;
this->storage.kvScans = &counters.kvScans;
this->storage.kvCommits = &counters.kvCommits;
@ -4762,7 +4765,6 @@ ACTOR Future<Void> mapSubquery(StorageServer* data,
Arena* pArena,
int matchIndex,
bool isRangeQuery,
bool isBoundary,
KeyValueRef* it,
MappedKeyValueRef* kvm,
Key mappedKey) {
@ -4770,31 +4772,42 @@ ACTOR Future<Void> mapSubquery(StorageServer* data,
// Use the mappedKey as the prefix of the range query.
GetRangeReqAndResultRef getRange = wait(quickGetKeyValues(data, mappedKey, version, pArena, pOriginalReq));
if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) ||
(getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY)) {
(getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY) || matchIndex == MATCH_INDEX_ALL) {
kvm->key = it->key;
kvm->value = it->value;
}
kvm->boundaryAndExist = isBoundary && !getRange.result.empty();
kvm->reqAndResult = getRange;
} else {
GetValueReqAndResultRef getValue = wait(quickGetValue(data, mappedKey, version, pArena, pOriginalReq));
kvm->reqAndResult = getValue;
kvm->boundaryAndExist = isBoundary && getValue.result.present();
}
return Void();
}
int getMappedKeyValueSize(MappedKeyValueRef mappedKeyValue) {
auto& reqAndResult = mappedKeyValue.reqAndResult;
int bytes = 0;
if (std::holds_alternative<GetValueReqAndResultRef>(reqAndResult)) {
const auto& getValue = std::get<GetValueReqAndResultRef>(reqAndResult);
bytes = getValue.expectedSize();
} else if (std::holds_alternative<GetRangeReqAndResultRef>(reqAndResult)) {
const auto& getRange = std::get<GetRangeReqAndResultRef>(reqAndResult);
bytes = getRange.result.expectedSize();
} else {
throw internal_error();
}
return bytes;
}
ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
GetKeyValuesReply input,
StringRef mapper,
// To provide span context, tags, debug ID to underlying lookups.
GetMappedKeyValuesRequest* pOriginalReq,
Optional<Key> tenantPrefix,
int matchIndex) {
int matchIndex,
int* remainingLimitBytes) {
state GetMappedKeyValuesReply result;
result.version = input.version;
result.more = input.more;
result.cached = input.cached;
result.arena.dependsOn(input.arena);
@ -4823,22 +4836,15 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
g_traceBatch.addEvent("TransactionDebug",
pOriginalReq->options.get().debugID.get().first(),
"storageserver.mapKeyValues.BeforeLoop");
for (; offset < sz; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) {
for (; offset < sz && *remainingLimitBytes > 0; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) {
// Divide into batches of MAX_PARALLEL_QUICK_GET_VALUE subqueries
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
KeyValueRef* it = &input.data[i + offset];
MappedKeyValueRef* kvm = &kvms[i];
bool isBoundary = (i + offset) == 0 || (i + offset) == sz - 1;
// need to keep the boundary, so that caller can use it as a continuation.
if (isBoundary || matchIndex == MATCH_INDEX_ALL) {
kvm->key = it->key;
kvm->value = it->value;
} else {
// Clear key value to the default.
kvm->key = ""_sr;
kvm->value = ""_sr;
}
// Clear key value to the default.
kvm->key = ""_sr;
kvm->value = ""_sr;
Key mappedKey = constructMappedKey(it, vt, mappedKeyFormatTuple);
// Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously.
result.arena.dependsOn(mappedKey.arena());
@ -4846,16 +4852,8 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
// std::cout << "key:" << printable(kvm->key) << ", value:" << printable(kvm->value)
// << ", mappedKey:" << printable(mappedKey) << std::endl;
subqueries.push_back(mapSubquery(data,
input.version,
pOriginalReq,
&result.arena,
matchIndex,
isRangeQuery,
isBoundary,
it,
kvm,
mappedKey));
subqueries.push_back(mapSubquery(
data, input.version, pOriginalReq, &result.arena, matchIndex, isRangeQuery, it, kvm, mappedKey));
}
wait(waitForAll(subqueries));
if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present())
@ -4864,9 +4862,31 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
"storageserver.mapKeyValues.AfterBatch");
subqueries.clear();
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
// since we always read the index, always consider the index size
int indexSize = sizeof(KeyValueRef) + input.data[i + offset].expectedSize();
int size = indexSize + getMappedKeyValueSize(kvms[i]);
*remainingLimitBytes -= size;
result.data.push_back(result.arena, kvms[i]);
if (SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT && *remainingLimitBytes <= 0) {
break;
}
}
}
int resultSize = result.data.size();
if (resultSize > 0) {
// keep the index key/value for boundary entries, so that the caller can use them as a continuation.
result.data[0].key = input.data[0].key;
result.data[0].value = input.data[0].value;
result.data[0].boundaryAndExist = getMappedKeyValueSize(kvms[0]) > 0;
result.data.back().key = input.data[resultSize - 1].key;
result.data.back().value = input.data[resultSize - 1].value;
// kvms is reused for each batch, so map the last result entry back to its index within the final batch
int index = (resultSize - 1) % SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
result.data.back().boundaryAndExist = getMappedKeyValueSize(kvms[index]) > 0;
}
result.more = input.more || resultSize < sz;
if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present())
g_traceBatch.addEvent("TransactionDebug",
pOriginalReq->options.get().debugID.get().first(),
@ -5121,12 +5141,15 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
req.reply.send(none);
} else {
state int remainingLimitBytes = req.limitBytes;
// create a temporary byte limit for index fetching ONLY; this is intentionally generous
// because readRange is cheap when reading additional bytes
state int bytesForIndex =
std::min(req.limitBytes, (int)(req.limitBytes * SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH));
GetKeyValuesReply getKeyValuesReply = wait(readRange(data,
version,
KeyRangeRef(begin, end),
req.limit,
&remainingLimitBytes,
&bytesForIndex,
span.context,
req.options,
tenantPrefix));
@ -5140,9 +5163,10 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
try {
// Map the scanned range to another list of keys and look up.
GetMappedKeyValuesReply _r =
wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, tenantPrefix, req.matchIndex));
wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, req.matchIndex, &remainingLimitBytes));
r = _r;
} catch (Error& e) {
// catch txn_too_old here if the prefetch runs for too long, and return it to the client
TraceEvent("MapError").error(e);
throw;
}
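A worked example of the two-budget scheme above, under assumed knob values (FRACTION_INDEX_BYTELIMIT_PREFETCH and MAX_PARALLEL_QUICK_GET_VALUE are configurable; every number below is illustrative):
// Sketch only; the knob values are assumptions.
int limitBytes = 80000;             // req.limitBytes from the client
double fraction = 0.2;              // assumed FRACTION_INDEX_BYTELIMIT_PREFETCH
int bytesForIndex = std::min(limitBytes, (int)(limitBytes * fraction)); // 16000
// readRange() prefetches index entries against the 16000-byte budget only;
// mapKeyValues() then charges each index entry plus its mapped result against
// the full 80000-byte remainingLimitBytes, and when STRICTLY_ENFORCE_BYTE_LIMIT
// is set it stops emitting results as soon as that budget is exhausted.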
@ -6138,6 +6162,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> tryReadBlobGranules(Tra
loop {
try {
Standalone<VectorRef<BlobGranuleChunkRef>> chunks = wait(tr->readBlobGranules(keys, 0, readVersion));
TraceEvent(SevDebug, "ReadBlobGranules").detail("Keys", keys).detail("Chunks", chunks.size());
return chunks;
} catch (Error& e) {
if (retryCount >= maxRetryCount) {
@ -6169,10 +6194,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
for (i = 0; i < chunks.size(); ++i) {
state KeyRangeRef chunkRange = chunks[i].keyRange;
state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn));
TraceEvent("ReadBlobData")
.detail("Rows", rows.size())
.detail("ChunkRange", chunkRange.toString())
.detail("Keys", keys.toString());
TraceEvent(SevDebug, "ReadBlobData").detail("Rows", rows.size()).detail("ChunkRange", chunkRange);
if (rows.size() == 0) {
rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end));
}
@ -6185,7 +6207,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
} catch (Error& e) {
TraceEvent(SevWarn, "ReadBlobDataFailure")
.suppressFor(5.0)
.detail("Keys", keys.toString())
.detail("Keys", keys)
.detail("FetchVersion", fetchVersion)
.detail("Error", e.what());
tr->reset();
@ -6994,7 +7016,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
// We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure
// change feed mutations get applied correctly
state std::vector<Key> changeFeedsToFetch;
if (!isFullRestoreMode()) {
state bool isFullRestore = wait(isFullRestoreMode(data->cx, keys));
if (!isFullRestore) {
std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
changeFeedsToFetch = _cfToFetch;
}
@ -7072,7 +7095,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
state PromiseStream<RangeResult> results;
state Future<Void> hold;
if (SERVER_KNOBS->FETCH_USING_BLOB) {
if (isFullRestore) {
hold = tryGetRangeFromBlob(results, &tr, keys, fetchVersion, data->blobConn);
} else {
hold = tryGetRange(results, &tr, keys);
@ -7110,7 +7133,6 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
data->thisServerID);
}
}
metricReporter.addFetchedBytes(expectedBlockSize, this_block.size());
// Write this_block to storage
@ -9703,6 +9725,9 @@ void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned)
void StorageServerDisk::clearRange(KeyRangeRef keys) {
storage->clear(keys, &data->metrics);
++(*kvClearRanges);
if (keys.singleKeyRange()) {
++(*kvClearSingleKey);
}
}
void StorageServerDisk::writeKeyValue(KeyValueRef kv) {
@ -9717,6 +9742,9 @@ void StorageServerDisk::writeMutation(MutationRef mutation) {
} else if (mutation.type == MutationRef::ClearRange) {
storage->clear(KeyRangeRef(mutation.param1, mutation.param2), &data->metrics);
++(*kvClearRanges);
if (KeyRangeRef(mutation.param1, mutation.param2).singleKeyRange()) {
++(*kvClearSingleKey);
}
} else
ASSERT(false);
}
@ -9732,6 +9760,9 @@ void StorageServerDisk::writeMutations(const VectorRef<MutationRef>& mutations,
} else if (m.type == MutationRef::ClearRange) {
storage->clear(KeyRangeRef(m.param1, m.param2), &data->metrics);
++(*kvClearRanges);
if (KeyRangeRef(m.param1, m.param2).singleKeyRange()) {
++(*kvClearSingleKey);
}
}
}
}
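The new counter keys off KeyRangeRef::singleKeyRange(), which holds exactly when a clear covers one key; a minimal sketch (keys are illustrative):
// Sketch only: a single-key clear is a range of the form [k, keyAfter(k)).
Key k = "apple"_sr;
ASSERT(KeyRangeRef(k, keyAfter(k)).singleKeyRange()); // counted in kvClearSingleKey
ASSERT(!KeyRangeRef("a"_sr, "b"_sr).singleKeyRange()); // counted only in kvClearRanges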
@ -10399,20 +10430,20 @@ ACTOR Future<Void> metricsCore(StorageServer* self, StorageServerInterface ssi)
te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
te.detail("Tag", self->tag.toString());
std::vector<int> rpr = self->readPriorityRanks;
te.detail("ReadsActive", self->ssLock.totalRunners());
te.detail("ReadsWaiting", self->ssLock.totalWaiters());
te.detail("ReadsTotalActive", self->ssLock->getRunnersCount());
te.detail("ReadsTotalWaiting", self->ssLock->getWaitersCount());
int type = (int)ReadType::FETCH;
te.detail("ReadFetchActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadFetchWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadFetchActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadFetchWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::LOW;
te.detail("ReadLowActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadLowWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadLowActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadLowWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::NORMAL;
te.detail("ReadNormalActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadNormalWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadNormalActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadNormalWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::HIGH;
te.detail("ReadHighActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadHighWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadHighActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadHighWaiting", self->ssLock->getWaitersCount(rpr[type]));
StorageBytes sb = self->storage.getStorageBytes();
te.detail("KvstoreBytesUsed", sb.used);
te.detail("KvstoreBytesFree", sb.free);
@ -11228,7 +11259,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
// If the storage server dies while something that uses self is still on the stack,
// we want that actor to complete before we terminate and that memory goes out of scope
self.ssLock.kill();
self.ssLock->kill();
state Error err = e;
if (storageServerTerminated(self, persistentData, err)) {
@ -11326,7 +11357,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
throw internal_error();
} catch (Error& e) {
self.ssLock.kill();
self.ssLock->kill();
if (self.byteSampleRecovery.isValid()) {
self.byteSampleRecovery.cancel();

View File

@ -2335,6 +2335,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
} else {
startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
DUMPTOKEN(recruited.haltBlobMigrator);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.ssi.getValue);
DUMPTOKEN(recruited.ssi.getKey);
DUMPTOKEN(recruited.ssi.getKeyValues);
@ -2345,7 +2346,6 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
DUMPTOKEN(recruited.ssi.getReadHotRanges);
DUMPTOKEN(recruited.ssi.getRangeSplitPoints);
DUMPTOKEN(recruited.ssi.getStorageMetrics);
DUMPTOKEN(recruited.ssi.waitFailure);
DUMPTOKEN(recruited.ssi.getQueuingMetrics);
DUMPTOKEN(recruited.ssi.getKeyValueStoreType);
DUMPTOKEN(recruited.ssi.watchValue);

View File

@ -20,7 +20,9 @@
#include <cstdint>
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -28,9 +30,13 @@
struct CreateTenantWorkload : TestWorkload {
static constexpr auto NAME = "CreateTenant";
TenantName tenant;
Optional<TenantGroupName> tenantGroup;
CreateTenantWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
tenant = getOption(options, "name"_sr, "DefaultTenant"_sr);
if (hasOption(options, "group"_sr)) {
tenantGroup = getOption(options, "group"_sr, "DefaultGroup"_sr);
}
}
Future<Void> setup(Database const& cx) override {
@ -46,7 +52,12 @@ struct CreateTenantWorkload : TestWorkload {
ACTOR static Future<Void> _setup(CreateTenantWorkload* self, Database db) {
try {
Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(db.getReference(), self->tenant));
TenantMapEntry givenEntry;
if (self->tenantGroup.present()) {
givenEntry.tenantGroup = self->tenantGroup.get();
givenEntry.encrypted = SERVER_KNOBS->ENABLE_ENCRYPTION;
}
Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(db.getReference(), self->tenant, givenEntry));
ASSERT(entry.present());
} catch (Error& e) {
TraceEvent(SevError, "TenantCreationFailed").error(e);

View File

@ -38,6 +38,8 @@ const KeyRef prefix = "prefix"_sr;
const KeyRef RECORD = "RECORD"_sr;
const KeyRef INDEX = "INDEX"_sr;
int recordSize;
int indexSize;
struct GetMappedRangeWorkload : ApiWorkload {
static constexpr auto NAME = "GetMappedRange";
bool enabled;
@ -93,19 +95,32 @@ struct GetMappedRangeWorkload : ApiWorkload {
loop {
std::cout << "start fillInRecords n=" << n << std::endl;
// TODO: When n is large, split into multiple transactions.
recordSize = 0;
indexSize = 0;
try {
for (int i = 0; i < n; i++) {
if (self->SPLIT_RECORDS) {
for (int split = 0; split < SPLIT_SIZE; split++) {
tr.set(recordKey(i, split), recordValue(i, split));
if (i == 0) {
recordSize +=
recordKey(i, split).size() + recordValue(i, split).size() + sizeof(KeyValueRef);
}
}
} else {
tr.set(recordKey(i), recordValue(i));
if (i == 0) {
recordSize += recordKey(i).size() + recordValue(i).size() + sizeof(KeyValueRef);
}
}
tr.set(indexEntryKey(i), EMPTY);
if (i == 0) {
indexSize += indexEntryKey(i).size() + sizeof(KeyValueRef);
}
}
wait(tr.commit());
std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << std::endl;
std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << " recordSize "
<< recordSize << " indexSize " << indexSize << std::endl;
break;
} catch (Error& e) {
std::cout << "failed fillInRecords, retry" << std::endl;
@ -146,8 +161,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
int matchIndex,
bool isBoundary,
bool allMissing) {
// std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key) << "
// indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId)) << std::endl;
// std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key)
// << " indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId))
// << " matchIndex: " << matchIndex << std::endl;
if (matchIndex == MATCH_INDEX_ALL || isBoundary) {
ASSERT(it->key == indexEntryKey(expectedId));
} else if (matchIndex == MATCH_INDEX_MATCHED_ONLY) {
@ -163,7 +179,6 @@ struct GetMappedRangeWorkload : ApiWorkload {
ASSERT(std::holds_alternative<GetRangeReqAndResultRef>(it->reqAndResult));
auto& getRange = std::get<GetRangeReqAndResultRef>(it->reqAndResult);
auto& rangeResult = getRange.result;
ASSERT(it->boundaryAndExist == (isBoundary && !rangeResult.empty()));
// std::cout << "rangeResult.size()=" << rangeResult.size() << std::endl;
// In the future, we may be able to do the continuation more efficiently by combining partial results
// together and then validate.
@ -200,6 +215,7 @@ struct GetMappedRangeWorkload : ApiWorkload {
KeySelector endSelector,
Key mapper,
int limit,
int byteLimit,
int expectedBeginId,
GetMappedRangeWorkload* self,
int matchIndex,
@ -207,14 +223,16 @@ struct GetMappedRangeWorkload : ApiWorkload {
std::cout << "start scanMappedRangeWithLimits beginSelector:" << beginSelector.toString()
<< " endSelector:" << endSelector.toString() << " expectedBeginId:" << expectedBeginId
<< " limit:" << limit << std::endl;
<< " limit:" << limit << " byteLimit: " << byteLimit << " recordSize: " << recordSize
<< " STRICTLY_ENFORCE_BYTE_LIMIT: " << SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT << " allMissing "
<< allMissing << std::endl;
loop {
state Reference<TransactionWrapper> tr = self->createTransaction();
try {
MappedRangeResult result = wait(tr->getMappedRange(beginSelector,
endSelector,
mapper,
GetRangeLimits(limit),
GetRangeLimits(limit, byteLimit),
matchIndex,
self->snapshot,
Reverse::False));
@ -270,17 +288,51 @@ struct GetMappedRangeWorkload : ApiWorkload {
Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone();
state KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple));
state int limit = 100;
state int byteLimit = deterministicRandom()->randomInt(1, 9) * 10000;
state int expectedBeginId = beginId;
std::cout << "ByteLimit: " << byteLimit << " limit: " << limit
<< " FRACTION_INDEX_BYTELIMIT_PREFETCH: " << SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH
<< " MAX_PARALLEL_QUICK_GET_VALUE: " << SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE << std::endl;
while (true) {
MappedRangeResult result = wait(self->scanMappedRangeWithLimits(
cx, beginSelector, endSelector, mapper, limit, expectedBeginId, self, matchIndex, allMissing));
MappedRangeResult result = wait(self->scanMappedRangeWithLimits(cx,
beginSelector,
endSelector,
mapper,
limit,
byteLimit,
expectedBeginId,
self,
matchIndex,
allMissing));
expectedBeginId += result.size();
if (result.more) {
if (result.empty()) {
// This is usually not expected.
std::cout << "not result but have more, try again" << std::endl;
} else {
// auto& reqAndResult = std::get<GetRangeReqAndResultRef>(result.back().reqAndResult);
int size = allMissing ? indexSize : (indexSize + recordSize);
int expectedCnt = limit;
int indexByteLimit = byteLimit * SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH;
int indexCountByteLimit = indexByteLimit / indexSize + (indexByteLimit % indexSize != 0);
int indexCount = std::min(limit, indexCountByteLimit);
std::cout << "indexCount: " << indexCount << std::endl;
// result set cannot be larger than the number of index fetched
ASSERT(result.size() <= indexCount);
expectedCnt = std::min(expectedCnt, indexCount);
int boundByRecord;
if (SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT) {
// might have 1 additional entry over the limit
boundByRecord = byteLimit / size + (byteLimit % size != 0);
} else {
// might have 1 additional batch over the limit
int roundSize = size * SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
int round = byteLimit / roundSize + (byteLimit % roundSize != 0);
boundByRecord = round * SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
}
expectedCnt = std::min(expectedCnt, boundByRecord);
std::cout << "boundByRecord: " << boundByRecord << std::endl;
ASSERT(result.size() == expectedCnt);
beginSelector = KeySelector(firstGreaterThan(result.back().key));
}
} else {
@ -289,6 +341,7 @@ struct GetMappedRangeWorkload : ApiWorkload {
}
}
ASSERT(expectedBeginId == endId);
return Void();
}
@ -433,6 +486,8 @@ struct GetMappedRangeWorkload : ApiWorkload {
} else if (r < 0.75) {
matchIndex = MATCH_INDEX_UNMATCHED_ONLY;
}
state bool originalStrictlyEnforceByteLimit = SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT;
(const_cast<ServerKnobs*>(SERVER_KNOBS))->STRICTLY_ENFORCE_BYTE_LIMIT = deterministicRandom()->coinflip();
wait(self->scanMappedRange(cx, 10, 490, mapper, self, matchIndex));
{
@ -440,6 +495,8 @@ struct GetMappedRangeWorkload : ApiWorkload {
wait(self->scanMappedRange(cx, 10, 490, mapper, self, MATCH_INDEX_UNMATCHED_ONLY, true));
}
// reset it to default
(const_cast<ServerKnobs*>(SERVER_KNOBS))->STRICTLY_ENFORCE_BYTE_LIMIT = originalStrictlyEnforceByteLimit;
return Void();
}
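Restating the assertion bound in scanMappedRange as a small calculation may help; all numbers below are assumptions for illustration, not values fixed by this change:
// Sketch only; indexSize = 40, recordSize = 130, limit = 100, byteLimit = 10000,
// FRACTION_INDEX_BYTELIMIT_PREFETCH = 0.5, MAX_PARALLEL_QUICK_GET_VALUE = 8.
int indexByteLimit = (int)(10000 * 0.5);          // 5000
int indexCount = std::min(100, (5000 + 39) / 40); // min(100, 125) = 100
int size = 40 + 130;                              // index + record bytes = 170
// STRICTLY_ENFORCE_BYTE_LIMIT on:  boundByRecord = ceil(10000 / 170)           = 59
// STRICTLY_ENFORCE_BYTE_LIMIT off: boundByRecord = ceil(10000 / (170 * 8)) * 8 = 64
// expectedCnt = min(limit, indexCount, boundByRecord) = 59 (strict) or 64 (batched).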

View File

@ -68,7 +68,17 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
void getMetrics(std::vector<PerfMetric>& m) override {}
// disable the default timeout setting
double getCheckTimeout() const override { return std::numeric_limits<double>::max(); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("RandomMoveKeys"); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override {
out.insert("RandomMoveKeys");
// Rollback interferes with the
// \xff\xff/worker_interfaces test, since it can
// trigger a cluster recovery, causing the worker
// interface for a machine to be updated in the middle
// of the test.
out.insert("RollbackWorkload");
}
Future<Void> _setup(Database cx, SpecialKeySpaceCorrectnessWorkload* self) {
cx->specialKeySpace = std::make_unique<SpecialKeySpace>();

View File

@ -18,9 +18,10 @@
* limitations under the License.
*/
#include "fdbrpc/TenantName.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbrpc/TenantName.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
@ -31,12 +32,16 @@
struct StorageQuotaWorkload : TestWorkload {
static constexpr auto NAME = "StorageQuota";
TenantGroupName group;
TenantName tenant;
int nodeCount;
TenantName emptyTenant;
StorageQuotaWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
nodeCount = getOption(options, "nodeCount"_sr, 10000);
group = getOption(options, "group"_sr, "DefaultGroup"_sr);
tenant = getOption(options, "tenant"_sr, "DefaultTenant"_sr);
nodeCount = getOption(options, "nodeCount"_sr, 10000);
emptyTenant = getOption(options, "emptyTenant"_sr, "DefaultTenant"_sr);
}
Future<Void> setup(Database const& cx) override {
@ -67,27 +72,42 @@ struct StorageQuotaWorkload : TestWorkload {
Standalone<KeyValueRef> operator()(int n) { return KeyValueRef(keyForIndex(n), value((n + 1) % nodeCount)); }
ACTOR Future<Void> _start(StorageQuotaWorkload* self, Database cx) {
// Check that the quota set/get functions work as expected.
// Set the quota to just below the current size.
state TenantMapEntry entry1 = wait(TenantAPI::getTenant(cx.getReference(), self->tenant));
state TenantMapEntry entry2 = wait(TenantAPI::getTenant(cx.getReference(), self->emptyTenant));
ASSERT(entry1.tenantGroup.present() && entry1.tenantGroup.get() == self->group &&
entry2.tenantGroup.present() && entry2.tenantGroup.get() == self->group);
// Get the size of the non-empty tenant. We will set the quota of the tenant group
// to just below the current size of this tenant.
state int64_t size = wait(getSize(cx, self->tenant));
state int64_t quota = size - 1;
wait(setStorageQuotaHelper(cx, self->tenant, quota));
state Optional<int64_t> quotaRead = wait(getStorageQuotaHelper(cx, self->tenant));
// Check that the quota set/get functions work as expected.
wait(setStorageQuotaHelper(cx, self->group, quota));
state Optional<int64_t> quotaRead = wait(getStorageQuotaHelper(cx, self->group));
ASSERT(quotaRead.present() && quotaRead.get() == quota);
if (!SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED) {
if (!SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
return Void();
}
// Check that writes are rejected when the tenant is over quota.
state bool rejected = wait(tryWrite(self, cx, /*expectOk=*/false));
ASSERT(rejected);
// Check that writes to both the tenants are rejected when the group is over quota.
state bool rejected1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/false));
ASSERT(rejected1);
state bool rejected2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/false));
ASSERT(rejected2);
// Increase the quota. Check that writes are now able to commit.
quota = size * 2;
wait(setStorageQuotaHelper(cx, self->tenant, quota));
state bool committed = wait(tryWrite(self, cx, /*expectOk=*/true));
ASSERT(committed);
// Increase the quota or clear the quota. Check that writes to both the tenants are now able to commit.
if (deterministicRandom()->coinflip()) {
quota = size * 2;
wait(setStorageQuotaHelper(cx, self->group, quota));
} else {
wait(clearStorageQuotaHelper(cx, self->group));
}
state bool committed1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/true));
ASSERT(committed1);
state bool committed2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/true));
ASSERT(committed2);
return Void();
}
@ -115,11 +135,11 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<Void> setStorageQuotaHelper(Database cx, TenantName tenantName, int64_t quota) {
ACTOR static Future<Void> setStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName, int64_t quota) {
state Transaction tr(cx);
loop {
try {
setStorageQuota(tr, tenantName, quota);
setStorageQuota(tr, tenantGroupName, quota);
wait(tr.commit());
return Void();
} catch (Error& e) {
@ -128,12 +148,24 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, TenantName tenantName) {
ACTOR static Future<Void> clearStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName) {
state Transaction tr(cx);
loop {
try {
state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantName));
clearStorageQuota(tr, tenantGroupName);
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName) {
state Transaction tr(cx);
loop {
try {
state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantGroupName));
return quota;
} catch (Error& e) {
wait(tr.onError(e));
@ -141,13 +173,13 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self, Database cx, bool expectOk) {
ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self, Database cx, TenantName tenant, bool expectOk) {
state int i;
// Retry the transaction a few times if needed; this allows us to wait for a while for all
// the storage usage and quota related monitors to fetch and propagate the latest information
// about the tenants that are over storage quota.
for (i = 0; i < 10; i++) {
state Transaction tr(cx, self->tenant);
state Transaction tr(cx, tenant);
loop {
try {
Standalone<KeyValueRef> kv =

View File

@ -118,14 +118,14 @@ Arena::Arena(Arena&& r) noexcept = default;
Arena& Arena::operator=(const Arena& r) = default;
Arena& Arena::operator=(Arena&& r) noexcept = default;
void Arena::dependsOn(const Arena& p) {
if (p.impl) {
// x.dependsOn(y) is a no-op if they refer to the same ArenaBlocks.
// They will already have the same lifetime.
if (p.impl && p.impl.getPtr() != impl.getPtr()) {
allowAccess(impl.getPtr());
allowAccess(p.impl.getPtr());
ArenaBlock::dependOn(impl, p.impl.getPtr());
disallowAccess(p.impl.getPtr());
if (p.impl.getPtr() != impl.getPtr()) {
disallowAccess(impl.getPtr());
}
disallowAccess(impl.getPtr());
}
}
@ -297,6 +297,7 @@ void* ArenaBlock::make4kAlignedBuffer(uint32_t size) {
}
void ArenaBlock::dependOn(Reference<ArenaBlock>& self, ArenaBlock* other) {
ASSERT(self->getData() != other->getData());
other->addref();
if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef))
create(SMALL, self)->makeReference(other);
@ -775,6 +776,16 @@ TEST_CASE("/flow/Arena/Size") {
return Void();
}
// Test that x.dependsOn(x) works, and is effectively a no-op.
TEST_CASE("/flow/Arena/SelfRef") {
Arena a(4096);
// This should be a no-op.
a.dependsOn(a);
return Void();
}
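For contrast with the self-reference no-op tested above, a minimal sketch of the normal cross-arena use (names and contents are illustrative):
// Sketch only: y's blocks stay alive for as long as x lives.
Arena x;
Arena y;
StringRef payload = StringRef(y, "payload"_sr); // copied into y's blocks
x.dependsOn(y); // x now references y's blocks
// payload stays valid while x is alive, even if y goes out of scope first.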
TEST_CASE("flow/StringRef/eat") {
StringRef str = "test/case"_sr;
StringRef first = str.eat("/");
@ -815,4 +826,4 @@ TEST_CASE("flow/StringRef/eat") {
ASSERT(str == ""_sr);
return Void();
}
}

View File

@ -29,21 +29,25 @@
#define PRIORITYMULTILOCK_ACTOR_H
#include "flow/flow.h"
#include <boost/intrusive/list.hpp>
#include "flow/actorcompiler.h" // This must be the last #include.
#define PRIORITYMULTILOCK_DEBUG 0
#if PRIORITYMULTILOCK_DEBUG || !defined(NO_INTELLISENSE)
#define pml_debug_printf(...) \
if (now() > 0) \
printf(__VA_ARGS__)
if (now() > 0) { \
printf("pml line=%04d ", __LINE__); \
printf(__VA_ARGS__); \
}
#else
#define pml_debug_printf(...)
#endif
// A multi user lock with a concurrent holder limit where waiters request a lock with a priority
// id and are granted locks based on a total concurrency and relative weights of the current active
// priorities. Priority ids must start at 0 and are sequential integers.
// priorities. Priority ids must start at 0 and are sequential integers. Priority id numbers
// are not related to the importance of the priority in execution.
//
// Scheduling logic
// Let
@ -64,17 +68,17 @@
// The interface is similar to FlowMutex except that lock holders can just drop the lock to release it.
//
// Usage:
// Lock lock = wait(prioritylock.lock(priorityLevel));
// Lock lock = wait(prioritylock.lock(priority_id));
// lock.release(); // Explicit release, or
// // let lock and all copies of lock go out of scope to release
class PriorityMultiLock {
class PriorityMultiLock : public ReferenceCounted<PriorityMultiLock> {
public:
// Waiting on the lock returns a Lock, which is really just a Promise<Void>
// Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release
// the Lock before it goes out of scope.
struct Lock {
void release() { promise.send(Void()); }
bool isLocked() const { return promise.canBeSet(); }
// This is exposed in case the caller wants to use/copy it directly
Promise<Void> promise;
@ -84,10 +88,11 @@ public:
: PriorityMultiLock(concurrency, parseStringToVector<int>(weights, ',')) {}
PriorityMultiLock(int concurrency, std::vector<int> weightsByPriority)
: concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0), releaseDebugID(0) {
: concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0) {
priorities.resize(weightsByPriority.size());
for (int i = 0; i < priorities.size(); ++i) {
priorities[i].priority = i;
priorities[i].weight = weightsByPriority[i];
}
@ -102,7 +107,8 @@ public:
// If this priority currently has no waiters
if (q.empty()) {
// Add this priority's weight to the total for priorities with pending work
// Add this priority's weight to the total for priorities with pending work. This must be done
// so that currentCapacity() below will assign capacity to this priority.
totalPendingWeights += p.weight;
// If there are slots available and the priority has capacity then don't make the caller wait
@ -114,80 +120,69 @@ public:
Lock lock;
addRunner(lock, &p);
pml_debug_printf("lock nowait line %d priority %d %s\n", __LINE__, priority, toString().c_str());
pml_debug_printf("lock nowait priority %d %s\n", priority, toString().c_str());
return lock;
}
// If we didn't return above then add the priority to the waitingPriorities list
waitingPriorities.push_back(p);
}
Waiter w;
q.push_back(w);
Waiter& w = q.emplace_back();
++waiting;
pml_debug_printf("lock wait line %d priority %d %s\n", __LINE__, priority, toString().c_str());
pml_debug_printf("lock wait priority %d %s\n", priority, toString().c_str());
return w.lockPromise.getFuture();
}
void kill() {
pml_debug_printf("kill %s\n", toString().c_str());
brokenOnDestruct.reset();
// handleRelease will not free up any execution slots when it ends via cancel
fRunner.cancel();
available = 0;
runners.clear();
waitingPriorities.clear();
priorities.clear();
}
std::string toString() const {
int runnersDone = 0;
for (int i = 0; i < runners.size(); ++i) {
if (runners[i].isReady()) {
++runnersDone;
}
}
std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d "
"runnersDone=%d pendingWeights=%d ",
std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d "
"pendingWeights=%d ",
this,
concurrency,
available,
concurrency - available,
waiting,
runners.size(),
runnersDone,
totalPendingWeights);
for (int i = 0; i < priorities.size(); ++i) {
s += format("p%d:{%s} ", i, priorities[i].toString(this).c_str());
for (auto& p : priorities) {
s += format("{%s} ", p.toString(this).c_str());
}
s += "}";
if (concurrency - available != runners.size() - runnersDone) {
pml_debug_printf("%s\n", s.c_str());
ASSERT_EQ(concurrency - available, runners.size() - runnersDone);
}
return s;
}
int maxPriority() const { return priorities.size() - 1; }
int totalWaiters() const { return waiting; }
int getRunnersCount() const { return concurrency - available; }
int getWaitersCount() const { return waiting; }
int numWaiters(const unsigned int priority) const {
int getWaitersCount(const unsigned int priority) const {
ASSERT(priority < priorities.size());
return priorities[priority].queue.size();
}
int totalRunners() const { return concurrency - available; }
int numRunners(const unsigned int priority) const {
int getRunnersCount(const unsigned int priority) const {
ASSERT(priority < priorities.size());
return priorities[priority].runners;
}
private:
struct Waiter {
Waiter() {}
Promise<Lock> lockPromise;
};
@ -202,8 +197,8 @@ private:
typedef Deque<Waiter> Queue;
struct Priority {
Priority() : runners(0), weight(0) {}
struct Priority : boost::intrusive::list_base_hook<> {
Priority() : runners(0), weight(0), priority(-1) {}
// Queue of waiters at this priority
Queue queue;
@ -211,9 +206,12 @@ private:
int runners;
// Configured weight for this priority
int weight;
// Priority number for convenience, matches *this's index in PML priorities vector
int priority;
std::string toString(const PriorityMultiLock* pml) const {
return format("weight=%d run=%d wait=%d cap=%d",
return format("priority=%d weight=%d run=%d wait=%d cap=%d",
priority,
weight,
runners,
queue.size(),
@ -222,51 +220,41 @@ private:
};
std::vector<Priority> priorities;
typedef boost::intrusive::list<Priority, boost::intrusive::constant_time_size<false>> WaitingPrioritiesList;
// Current or recent (ended) runners
Deque<Future<Void>> runners;
// List of all priorities with 1 or more waiters. This list exists so that the scheduling loop
// does not have to iterate over the priorities vector checking priorities without waiters.
WaitingPrioritiesList waitingPriorities;
Future<Void> fRunner;
AsyncTrigger wakeRunner;
Promise<Void> brokenOnDestruct;
// Used for debugging, can roll over without issue
unsigned int releaseDebugID;
ACTOR static Future<Void> handleRelease(PriorityMultiLock* self, Future<Void> f, Priority* priority) {
state [[maybe_unused]] unsigned int id = self->releaseDebugID++;
pml_debug_printf("%f handleRelease self=%p id=%u start \n", now(), self, id);
ACTOR static void handleRelease(Reference<PriorityMultiLock> self, Priority* priority, Future<Void> holder) {
pml_debug_printf("%f handleRelease self=%p start\n", now(), self.getPtr());
try {
wait(f);
pml_debug_printf("%f handleRelease self=%p id=%u success\n", now(), self, id);
wait(holder);
pml_debug_printf("%f handleRelease self=%p success\n", now(), self.getPtr());
} catch (Error& e) {
pml_debug_printf("%f handleRelease self=%p id=%u error %s\n", now(), self, id, e.what());
if (e.code() == error_code_actor_cancelled) {
throw;
}
pml_debug_printf("%f handleRelease self=%p error %s\n", now(), self.getPtr(), e.what());
}
pml_debug_printf("lock release line %d priority %d %s\n",
__LINE__,
(int)(priority - &self->priorities.front()),
self->toString().c_str());
pml_debug_printf("lock release priority %d %s\n", (int)(priority->priority), self->toString().c_str());
pml_debug_printf("%f handleRelease self=%p id=%u releasing\n", now(), self, id);
pml_debug_printf("%f handleRelease self=%p releasing\n", now(), self.getPtr());
++self->available;
priority->runners -= 1;
// If there are any waiters or if the runners array is getting large, trigger the runner loop
if (self->waiting > 0 || self->runners.size() > 1000) {
if (self->waiting > 0) {
self->wakeRunner.trigger();
}
return Void();
}
void addRunner(Lock& lock, Priority* p) {
p->runners += 1;
void addRunner(Lock& lock, Priority* priority) {
priority->runners += 1;
--available;
runners.push_back(handleRelease(this, lock.promise.getFuture(), p));
handleRelease(Reference<PriorityMultiLock>::addRef(this), priority, lock.promise.getFuture());
}
// Current maximum running tasks for the specified priority, which must have waiters
@ -278,76 +266,50 @@ private:
}
ACTOR static Future<Void> runner(PriorityMultiLock* self) {
state int sinceYield = 0;
state Future<Void> error = self->brokenOnDestruct.getFuture();
// Priority to try to run tasks from next
state int priority = 0;
state WaitingPrioritiesList::iterator p = self->waitingPriorities.end();
loop {
pml_debug_printf(
"runner loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
// Cleanup finished runner futures at the front of the runner queue.
while (!self->runners.empty() && self->runners.front().isReady()) {
self->runners.pop_front();
}
pml_debug_printf("runner loop start priority=%d %s\n", p->priority, self->toString().c_str());
// Wait for a runner to release its lock
pml_debug_printf(
"runner loop waitTrigger line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
pml_debug_printf("runner loop waitTrigger priority=%d %s\n", p->priority, self->toString().c_str());
wait(self->wakeRunner.onTrigger());
pml_debug_printf(
"%f runner loop wake line %d priority=%d %s\n", now(), __LINE__, priority, self->toString().c_str());
if (++sinceYield == 100) {
sinceYield = 0;
pml_debug_printf(
" runner waitDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
wait(delay(0));
pml_debug_printf(
" runner afterDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
}
pml_debug_printf("%f runner loop wake priority=%d %s\n", now(), p->priority, self->toString().c_str());
// While there are available slots and there are waiters, launch tasks
while (self->available > 0 && self->waiting > 0) {
pml_debug_printf(
" launch loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
Priority* pPriority;
pml_debug_printf(" launch loop start priority=%d %s\n", p->priority, self->toString().c_str());
// Find the next priority with waiters and capacity. There must be at least one.
loop {
// Rotate to next priority
if (++priority == self->priorities.size()) {
priority = 0;
if (p == self->waitingPriorities.end()) {
p = self->waitingPriorities.begin();
}
pPriority = &self->priorities[priority];
pml_debug_printf(" launch loop scan priority=%d %s\n", p->priority, self->toString().c_str());
pml_debug_printf(" launch loop scan line %d priority=%d %s\n",
__LINE__,
priority,
self->toString().c_str());
if (!pPriority->queue.empty() && pPriority->runners < self->currentCapacity(pPriority->weight)) {
if (!p->queue.empty() && p->runners < self->currentCapacity(p->weight)) {
break;
}
++p;
}
Queue& queue = pPriority->queue;
Queue& queue = p->queue;
Waiter w = queue.front();
queue.pop_front();
// If this priority is now empty, subtract its weight from the total pending weights
// If this priority is now empty, subtract its weight from the total pending weights and remove it
// from the waitingPriorities list
Priority* pPriority = &*p;
if (queue.empty()) {
p = self->waitingPriorities.erase(p);
self->totalPendingWeights -= pPriority->weight;
pml_debug_printf(" emptied priority line %d priority=%d %s\n",
__LINE__,
priority,
self->toString().c_str());
pml_debug_printf(
" emptied priority priority=%d %s\n", pPriority->priority, self->toString().c_str());
}
--self->waiting;
@ -365,10 +327,9 @@ private:
self->addRunner(lock, pPriority);
}
pml_debug_printf(" launched line %d alreadyDone=%d priority=%d %s\n",
__LINE__,
pml_debug_printf(" launched alreadyDone=%d priority=%d %s\n",
!lock.promise.canBeSet(),
priority,
pPriority->priority,
self->toString().c_str());
}
}
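Given the reference-counted lifecycle above, a minimal usage sketch (the concurrency, weight string, and priority ids are illustrative; kill() mirrors the shutdown paths in the pager and storage server changes in this commit):
// Sketch only: a weighted lock shared by three priority ids.
ACTOR Future<Void> doPrioritizedWork(Reference<PriorityMultiLock> pml, int priorityId) {
	// Waits until the weighted scheduler grants this priority id a slot.
	state PriorityMultiLock::Lock lock = wait(pml->lock(priorityId));
	wait(delay(0.01)); // simulated work while holding one concurrency slot
	return Void(); // dropping all copies of `lock` releases the slot
}
// Reference<PriorityMultiLock> pml = makeReference<PriorityMultiLock>(10, "1,2,8");
// wait(doPrioritizedWork(pml, 2));
// pml->kill(); // on shutdown: cancels the runner and drops all waiters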

View File

@ -0,0 +1,180 @@
/*
* BenchBlobDeltaFiles.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "benchmark/benchmark.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "flow/IRandom.h"
#include "flow/DeterministicRandom.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "flow/flow.h"
#include <cstdlib>
#include <stdexcept>
// Pre-generated GranuleDelta sizes in bytes for the benchmarks.
const static int PRE_GEN_TARGET_BYTES[] = { 128 * 1024, 512 * 1024, 1024 * 1024 };
// Generate GranuleDeltas deterministically. Change the seed to test a new data set.
class DeltaGenerator {
public:
DeltaGenerator(uint32_t seed = 12345678) {
randGen = Reference<IRandom>(new DeterministicRandom(seed));
// Generate key range
prefix = StringRef(ar, randGen->randomUniqueID().toString() + "_");
range = KeyRangeRef(prefix, StringRef(ar, strinc(prefix)));
// Generate version jump size
minVersionJump = randGen->randomExp(0, 25);
maxVersionJump = minVersionJump + randGen->randomExp(0, 25);
// Generate value size range
maxValueSize = randGen->randomExp(7, 9);
// Generate start version
version = randGen->randomUInt32();
// Generate probability of updating existing keys
updateExistingKeysProb = randGen->random01();
// Generate deltas
for (auto i : PRE_GEN_TARGET_BYTES) {
genDeltas(i);
}
fmt::print("key range: {} - {}\n", range.begin.printable(), range.end.printable());
fmt::print("start version: {}\n", version);
fmt::print("max value bytes: {}\n", maxValueSize);
fmt::print("version jump range: {} - {}\n", minVersionJump, maxVersionJump);
fmt::print("probability for update: {}\n", updateExistingKeysProb);
fmt::print("unseed: {}\n", randGen->randomUInt32());
}
KeyRange getRange() { return range; }
Standalone<GranuleDeltas> getDelta(int targetBytes) {
if (deltas.find(targetBytes) != deltas.end()) {
return deltas[targetBytes];
}
throw std::invalid_argument("Test delta file size is not pre-generated!");
}
private:
void genDeltas(int targetBytes) {
Standalone<GranuleDeltas> data;
int totalDataBytes = 0;
while (totalDataBytes < targetBytes) {
data.push_back(ar, newDelta());
totalDataBytes += data.back().expectedSize();
}
deltas[targetBytes] = data;
}
MutationRef newMutation() { return MutationRef(ar, MutationRef::SetValue, key(), value()); }
MutationsAndVersionRef newDelta() {
version += randGen->randomInt(minVersionJump, maxVersionJump);
MutationsAndVersionRef ret(version, version);
for (int i = 0; i < 10; i++) {
ret.mutations.push_back_deep(ar, newMutation());
}
return ret;
}
StringRef key() {
// Pick an existing key
if (randGen->random01() < updateExistingKeysProb && !usedKeys.empty()) {
int r = randGen->randomUInt32() % usedKeys.size();
auto it = usedKeys.begin();
for (; r != 0; r--)
it++;
return StringRef(ar, *it);
}
// Create a new key
std::string key = prefix.toString() + randGen->randomUniqueID().toString();
usedKeys.insert(key);
return StringRef(ar, key);
}
StringRef value() {
int valueSize = randGen->randomInt(maxValueSize / 2, maxValueSize * 3 / 2);
std::string value = randGen->randomUniqueID().toString();
if (value.size() > valueSize) {
value = value.substr(0, valueSize);
}
if (value.size() < valueSize) {
// repeated string so it's compressible
value += std::string(valueSize - value.size(), 'x');
}
return StringRef(ar, value);
}
Reference<IRandom> randGen;
Arena ar;
KeyRangeRef range;
Key prefix;
int maxValueSize;
Version version;
int minVersionJump;
int maxVersionJump;
std::set<std::string> usedKeys;
double updateExistingKeysProb;
std::map<int, Standalone<GranuleDeltas>> deltas;
};
static DeltaGenerator deltaGen; // Pre-generate deltas
// Benchmark serialization without compression/encryption. The main CPU cost should be sortDeltasByKey
static void bench_serialize_deltas(benchmark::State& state) {
int targetBytes = state.range(0);
int chunkSize = state.range(1);
Standalone<GranuleDeltas> delta = deltaGen.getDelta(targetBytes);
KeyRange range = deltaGen.getRange();
Standalone<StringRef> fileName = "testdelta"_sr; // unused
Optional<CompressionFilter> compressFilter; // unused. no compression
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx; // unused. no encryption
int64_t serializedBytes = 0; // 64-bit so many iterations of MB-sized outputs cannot overflow
for (auto _ : state) {
Value serialized = serializeChunkedDeltaFile(fileName, delta, range, chunkSize, compressFilter, cipherKeysCtx);
serializedBytes += serialized.size();
}
state.SetBytesProcessed(static_cast<long>(state.iterations()) * targetBytes);
state.counters["serialized_bytes"] = serializedBytes;
}
// Benchmark sorting deltas
static void bench_sort_deltas(benchmark::State& state) {
int targetBytes = state.range(0);
Standalone<GranuleDeltas> delta = deltaGen.getDelta(targetBytes);
KeyRange range = deltaGen.getRange();
for (auto _ : state) {
sortDeltasByKey(delta, range);
}
state.SetBytesProcessed(static_cast<long>(state.iterations()) * targetBytes);
}
// Benchmark serialization for granule deltas of 128KB, 512KB, and 1024KB with a 32KB chunk size
BENCHMARK(bench_serialize_deltas)
->Args({ 128 * 1024, 32 * 1024 })
->Args({ 512 * 1024, 32 * 1024 })
->Args({ 1024 * 1024, 32 * 1024 });
// Benchmark sorting for granule deltas of 128KB, 512KB, and 1024KB
BENCHMARK(bench_sort_deltas)->Args({ 128 * 1024 })->Args({ 512 * 1024 })->Args({ 1024 * 1024 });
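For reference, a minimal self-contained sketch of the Google Benchmark pattern both benchmark files here follow. The bench_example name and body are hypothetical; BENCHMARK, Args, state.range, DoNotOptimize, SetBytesProcessed, and BENCHMARK_MAIN are the standard benchmark-library API (in this repo the registered benchmarks are linked into a benchmark binary that supplies its own main).

#include "benchmark/benchmark.h"

static void bench_example(benchmark::State& state) {
	int targetBytes = state.range(0); // first value of each Args({ ... }) tuple
	for (auto _ : state) {
		benchmark::DoNotOptimize(targetBytes); // stand-in for the work being measured
	}
	// Report throughput the same way the benchmarks above do.
	state.SetBytesProcessed(static_cast<long>(state.iterations()) * targetBytes);
}
BENCHMARK(bench_example)->Args({ 128 * 1024 })->Args({ 512 * 1024 });
BENCHMARK_MAIN();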

View File

@ -25,26 +25,28 @@
#include "flow/PriorityMultiLock.actor.h"
#include <deque>
#include "flow/actorcompiler.h" // This must be the last #include.
#include "fmt/printf.h"
ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
state std::vector<int> priorities;
// Arg1 is the number of active priorities to use
// Arg2 is the number of inactive priorities to use
state int active = benchState->range(0);
state int inactive = benchState->range(1);
// Set up priority list with limits 10, 20, 30, ...
while (priorities.size() < benchState->range(0)) {
state std::vector<int> priorities;
while (priorities.size() < active + inactive) {
priorities.push_back(10 * (priorities.size() + 1));
}
state int concurrency = priorities.size() * 10;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
state Reference<PriorityMultiLock> pml = makeReference<PriorityMultiLock>(concurrency, priorities);
// Clog the lock by taking concurrency locks
// Clog the lock by taking n=concurrency locks
state std::deque<Future<PriorityMultiLock::Lock>> lockFutures;
for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(j % priorities.size()));
lockFutures.push_back(pml->lock(j % active));
}
// Wait for all of the initial locks to be taken
// This will work regardless of their priorities as there are only n = concurrency of them
wait(waitForAll(std::vector<Future<PriorityMultiLock::Lock>>(lockFutures.begin(), lockFutures.end())));
@ -64,7 +66,7 @@ ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
PriorityMultiLock::Lock lock = wait(f);
// Rotate to another priority
if (++p == priorities.size()) {
if (++p == active) {
p = 0;
}
@ -76,7 +78,6 @@ ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
benchState->SetItemsProcessed(static_cast<long>(benchState->iterations()));
delete pml;
return Void();
}
@ -84,4 +85,4 @@ static void bench_priorityMultiLock(benchmark::State& benchState) {
onMainThread([&benchState]() { return benchPriorityMultiLock(&benchState); }).blockUntilReady();
}
BENCHMARK(bench_priorityMultiLock)->DenseRange(1, 8)->ReportAggregatesOnly(true);
BENCHMARK(bench_priorityMultiLock)->Args({ 5, 0 })->Ranges({ { 1, 64 }, { 0, 128 } })->ReportAggregatesOnly(true);
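A hedged usage sketch of the interface this benchmark exercises, assuming only the signatures visible in the diff above: PriorityMultiLock is constructed with a total concurrency and a per-priority launch-limit vector, and lock(priority) returns a Future<PriorityMultiLock::Lock>. The lockExample actor name is hypothetical, and treating Lock destruction as the release point follows the benchmark's own usage rather than a documented guarantee.

ACTOR Future<Void> lockExample() {
	// Two priorities with launch limits 10 and 20; at most 20 concurrent holders.
	state Reference<PriorityMultiLock> pml = makeReference<PriorityMultiLock>(20, std::vector<int>{ 10, 20 });
	// Waits until a slot at priority 0 is available.
	state PriorityMultiLock::Lock lock = wait(pml->lock(0));
	// ... admission-controlled work happens here ...
	return Void(); // the slot is freed when `lock` goes out of scope
}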

View File

@ -334,9 +334,6 @@ logdir = {logdir}
db_config += " blob_granules_enabled:=1"
self.fdbcli_exec(db_config)
if self.blob_granules_enabled:
self.fdbcli_exec("blobrange start \\x00 \\xff")
# Generate and install test certificate chains and keys
def create_tls_cert(self):
assert self.tls_config is not None, "TLS not enabled"

View File

@ -6,6 +6,7 @@ enable_encryption = true
enable_tlog_encryption = true
enable_storage_server_encryption = false
enable_blob_granule_encryption = true
max_write_transaction_life_versions = 5000000
[[test]]
testTitle = 'EncryptedBackupAndRestore'

View File

@ -8,20 +8,36 @@ testTitle = 'TenantCreation'
[[test.workload]]
testName = 'CreateTenant'
name = 'First'
group = 'GroupA'
[[test.workload]]
testName = 'CreateTenant'
name = 'Second'
group = 'GroupA'
[[test.workload]]
testName = 'CreateTenant'
name = 'Third'
group = 'GroupB'
[[test.workload]]
testName = 'CreateTenant'
name = 'Fourth'
group = 'GroupB'
[[test]]
testTitle = 'StorageQuota'
[[test.workload]]
testName = 'StorageQuota'
group = 'GroupA'
tenant = 'First'
nodeCount = 250000
emptyTenant = 'Second'
[[test.workload]]
testName = 'StorageQuota'
tenant = 'Second'
group = 'GroupB'
tenant = 'Third'
nodeCount = 25000
emptyTenant = 'Fourth'