Merge branch 'main' of github.com:apple/foundationdb into tenant-list-filter

This commit is contained in:
Jon Fu 2022-11-14 12:30:28 -08:00
commit 25e1721e75
64 changed files with 1649 additions and 796 deletions

View File

@ -70,10 +70,13 @@ void ApiWorkload::start() {
schedule([this]() { schedule([this]() {
// 1. Clear data // 1. Clear data
clearData([this]() { clearData([this]() {
// 2. Populate initial data // 2. Workload setup
populateData([this]() { setup([this]() {
// 3. Generate random workload // 3. Populate initial data
runTests(); populateData([this]() {
// 4. Generate random workload
runTests();
});
}); });
}); });
}); });
@ -249,6 +252,10 @@ void ApiWorkload::populateData(TTaskFct cont) {
} }
} }
void ApiWorkload::setup(TTaskFct cont) {
schedule(cont);
}
void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional<int> tenantId) { void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional<int> tenantId) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction); int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto kvPairs = std::make_shared<std::vector<fdb::KeyValue>>(); auto kvPairs = std::make_shared<std::vector<fdb::KeyValue>>();
@ -322,4 +329,85 @@ std::optional<fdb::BytesRef> ApiWorkload::getTenant(std::optional<int> tenantId)
} }
} }
std::string ApiWorkload::debugTenantStr(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format("(tenant {0})", tenantId.value()) : "()";
}
// BlobGranule setup.
// This blobbifies ['\x00', '\xff') per tenant or for the whole database if there are no tenants.
void ApiWorkload::setupBlobGranules(TTaskFct cont) {
// This count is used to synchronize the # of tenant blobbifyRange() calls to ensure
// we only start the workload once blobbification has fully finished.
auto blobbifiedCount = std::make_shared<std::atomic<int>>(1);
if (tenants.empty()) {
blobbifiedCount->store(1);
blobbifyTenant({}, blobbifiedCount, cont);
} else {
blobbifiedCount->store(tenants.size());
for (int i = 0; i < tenants.size(); i++) {
schedule([=]() { blobbifyTenant(i, blobbifiedCount, cont); });
}
}
}
void ApiWorkload::blobbifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retBlobbifyRange = std::make_shared<bool>(false);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: blobbifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->blobbifyRange(begin, end).eraseType();
ctx->continueAfter(f, [ctx, retBlobbifyRange, f]() {
*retBlobbifyRange = f.get<fdb::future_var::Bool>();
ctx->done();
});
},
[=]() {
if (!*retBlobbifyRange) {
schedule([=]() { blobbifyTenant(tenantId, blobbifiedCount, cont); });
} else {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
void ApiWorkload::verifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retVerifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: verifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, /*latest_version*/ -2).eraseType();
ctx->continueAfter(f, [ctx, retVerifyVersion, f]() {
*retVerifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
});
},
[=]() {
if (*retVerifyVersion == -1) {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
} else {
if (blobbifiedCount->fetch_sub(1) == 1) {
schedule(cont);
}
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
} // namespace FdbApiTester } // namespace FdbApiTester

View File

@ -41,6 +41,9 @@ public:
virtual void checkProgress() override; virtual void checkProgress() override;
// Workload specific setup phase.
virtual void setup(TTaskFct cont);
// Running specific tests // Running specific tests
// The default implementation generates a workload consisting of // The default implementation generates a workload consisting of
// random operations generated by randomOperation // random operations generated by randomOperation
@ -126,6 +129,12 @@ protected:
void randomClearRangeOp(TTaskFct cont, std::optional<int> tenantId); void randomClearRangeOp(TTaskFct cont, std::optional<int> tenantId);
std::optional<fdb::BytesRef> getTenant(std::optional<int> tenantId); std::optional<fdb::BytesRef> getTenant(std::optional<int> tenantId);
std::string debugTenantStr(std::optional<int> tenantId);
// Generic BlobGranules setup.
void setupBlobGranules(TTaskFct cont);
void blobbifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
void verifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
private: private:
void populateDataTx(TTaskFct cont, std::optional<int> tenantId); void populateDataTx(TTaskFct cont, std::optional<int> tenantId);

View File

@ -52,26 +52,23 @@ private:
}; };
std::vector<OpType> excludedOpTypes; std::vector<OpType> excludedOpTypes;
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow // FIXME: should still guarantee a read succeeds eventually somehow
// FIXME: this needs to be per tenant if tenant ids are set
std::unordered_set<std::optional<int>> tenantsWithReadSuccess; std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); } inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); } inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
std::string tenantDebugString(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format(" (tenant {0})", tenantId.value()) : "";
}
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) { void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) { if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}){3}: {4}", info(fmt::format("{0}: [{1} - {2}) {3}: {4}",
opName, opName,
fdb::toCharsRef(begin), fdb::toCharsRef(begin),
fdb::toCharsRef(end), fdb::toCharsRef(end),
tenantDebugString(tenantId), debugTenantStr(tenantId),
message)); message));
} }
} }
@ -117,7 +114,7 @@ private:
results.get()->assign(resVector.begin(), resVector.end()); results.get()->assign(resVector.begin(), resVector.end());
bool previousSuccess = seenReadSuccess(tenantId); bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) { if (!previousSuccess) {
info(fmt::format("Read{0}: first success\n", tenantDebugString(tenantId))); info(fmt::format("Read {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId); setReadSuccess(tenantId);
} else { } else {
debugOp("Read", begin, end, tenantId, "complete"); debugOp("Read", begin, end, tenantId, "complete");
@ -289,20 +286,19 @@ private:
} }
// TODO: tenant support // TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont) { void randomGetBlobRangesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName(); fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName(); fdb::Key end = randomKeyName();
auto results = std::make_shared<std::vector<fdb::KeyRange>>(); auto results = std::make_shared<std::vector<fdb::KeyRange>>();
if (begin > end) { if (begin > end) {
std::swap(begin, end); std::swap(begin, end);
} }
std::optional<int> tenantId = {};
debugOp("GetBlobRanges", begin, end, tenantId, "starting"); debugOp("GetBlobRanges", begin, end, tenantId, "starting");
execOperation( execOperation(
[begin, end, results](auto ctx) { [begin, end, results](auto ctx) {
fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType(); fdb::Future f = ctx->dbOps()->listBlobbifiedRanges(begin, end, 1000).eraseType();
ctx->continueAfter(f, [ctx, f, results]() { ctx->continueAfter(f, [ctx, f, results]() {
*results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>()); *results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>());
ctx->done(); ctx->done();
@ -314,25 +310,24 @@ private:
this->validateRanges(results, begin, end, seenReadSuccess(tenantId)); this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont); schedule(cont);
}, },
getTenant(tenantId),
/* failOnError = */ false); /* failOnError = */ false);
} }
// TODO: tenant support // TODO: tenant support
void randomVerifyOp(TTaskFct cont) { void randomVerifyOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName(); fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName(); fdb::Key end = randomKeyName();
std::optional<int> tenantId;
if (begin > end) { if (begin > end) {
std::swap(begin, end); std::swap(begin, end);
} }
auto verifyVersion = std::make_shared<int64_t>(false);
debugOp("Verify", begin, end, tenantId, "starting"); debugOp("Verify", begin, end, tenantId, "starting");
auto verifyVersion = std::make_shared<int64_t>(-1);
execOperation( execOperation(
[begin, end, verifyVersion](auto ctx) { [begin, end, verifyVersion](auto ctx) {
fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType(); fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
ctx->continueAfter(f, [ctx, verifyVersion, f]() { ctx->continueAfter(f, [ctx, verifyVersion, f]() {
*verifyVersion = f.get<fdb::future_var::Int64>(); *verifyVersion = f.get<fdb::future_var::Int64>();
ctx->done(); ctx->done();
@ -344,15 +339,16 @@ private:
if (*verifyVersion == -1) { if (*verifyVersion == -1) {
ASSERT(!previousSuccess); ASSERT(!previousSuccess);
} else if (!previousSuccess) { } else if (!previousSuccess) {
info(fmt::format("Verify{0}: first success\n", tenantDebugString(tenantId))); info(fmt::format("Verify {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId); setReadSuccess(tenantId);
} }
schedule(cont); schedule(cont);
}, },
getTenant(tenantId),
/* failOnError = */ false); /* failOnError = */ false);
} }
void randomOperation(TTaskFct cont) { void randomOperation(TTaskFct cont) override {
std::optional<int> tenantId = randomTenant(); std::optional<int> tenantId = randomTenant();
OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST); OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
@ -380,10 +376,10 @@ private:
randomSummarizeOp(cont, tenantId); randomSummarizeOp(cont, tenantId);
break; break;
case OP_GET_BLOB_RANGES: case OP_GET_BLOB_RANGES:
randomGetBlobRangesOp(cont); randomGetBlobRangesOp(cont, tenantId);
break; break;
case OP_VERIFY: case OP_VERIFY:
randomVerifyOp(cont); randomVerifyOp(cont, tenantId);
break; break;
} }
} }

View File

@ -47,6 +47,8 @@ private:
OP_LAST = OP_CANCEL_PURGE OP_LAST = OP_CANCEL_PURGE
}; };
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// could add summarize too old and verify too old as ops if desired but those are lower value // could add summarize too old and verify too old as ops if desired but those are lower value
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet // Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet

View File

@ -91,13 +91,15 @@ public:
fdbDb = executor->selectDatabase(); fdbDb = executor->selectDatabase();
} }
if (tenantName) {
fdbTenant = fdbDb.openTenant(*tenantName);
fdbDbOps = std::make_shared<fdb::Tenant>(fdbTenant);
} else {
fdbDbOps = std::make_shared<fdb::Database>(fdbDb);
}
if (transactional) { if (transactional) {
if (tenantName) { fdbTx = fdbDbOps->createTransaction();
fdb::Tenant tenant = fdbDb.openTenant(*tenantName);
fdbTx = tenant.createTransaction();
} else {
fdbTx = fdbDb.createTransaction();
}
} }
} }
@ -109,6 +111,10 @@ public:
fdb::Database db() override { return fdbDb.atomic_load(); } fdb::Database db() override { return fdbDb.atomic_load(); }
fdb::Tenant tenant() override { return fdbTenant.atomic_load(); }
std::shared_ptr<fdb::IDatabaseOps> dbOps() override { return std::atomic_load(&fdbDbOps); }
fdb::Transaction tx() override { return fdbTx.atomic_load(); } fdb::Transaction tx() override { return fdbTx.atomic_load(); }
// Set a continuation to be executed when a future gets ready // Set a continuation to be executed when a future gets ready
@ -272,13 +278,17 @@ protected:
scheduler->schedule([thisRef]() { scheduler->schedule([thisRef]() {
fdb::Database db = thisRef->executor->selectDatabase(); fdb::Database db = thisRef->executor->selectDatabase();
thisRef->fdbDb.atomic_store(db); thisRef->fdbDb.atomic_store(db);
if (thisRef->tenantName) {
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTenant.atomic_store(tenant);
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Tenant>(tenant)));
} else {
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Database>(db)));
}
if (thisRef->transactional) { if (thisRef->transactional) {
if (thisRef->tenantName) { thisRef->fdbTx.atomic_store(thisRef->fdbDbOps->createTransaction());
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTx.atomic_store(tenant.createTransaction());
} else {
thisRef->fdbTx.atomic_store(db.createTransaction());
}
} }
thisRef->restartTransaction(); thisRef->restartTransaction();
}); });
@ -317,6 +327,14 @@ protected:
// Provides a thread safe interface by itself (no need for mutex) // Provides a thread safe interface by itself (no need for mutex)
fdb::Database fdbDb; fdb::Database fdbDb;
// FDB tenant
// Provides a thread safe interface by itself (no need for mutex)
fdb::Tenant fdbTenant;
// FDB IDatabaseOps to hide database/tenant accordingly.
// Provides a shared pointer to database functions based on if db or tenant.
std::shared_ptr<fdb::IDatabaseOps> fdbDbOps;
// FDB transaction // FDB transaction
// Provides a thread safe interface by itself (no need for mutex) // Provides a thread safe interface by itself (no need for mutex)
fdb::Transaction fdbTx; fdb::Transaction fdbTx;

View File

@ -41,6 +41,12 @@ public:
// Current FDB database // Current FDB database
virtual fdb::Database db() = 0; virtual fdb::Database db() = 0;
// Current FDB tenant
virtual fdb::Tenant tenant() = 0;
// Current FDB IDatabaseOps
virtual std::shared_ptr<fdb::IDatabaseOps> dbOps() = 0;
// Current FDB transaction // Current FDB transaction
virtual fdb::Transaction tx() = 0; virtual fdb::Transaction tx() = 0;

View File

@ -117,8 +117,11 @@ void WorkloadBase::execTransaction(TOpStartFct startFct,
} }
// Execute a non-transactional database operation within the workload // Execute a non-transactional database operation within the workload
void WorkloadBase::execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError) { void WorkloadBase::execOperation(TOpStartFct startFct,
doExecute(startFct, cont, {}, failOnError, false); TTaskFct cont,
std::optional<fdb::BytesRef> tenant,
bool failOnError) {
doExecute(startFct, cont, tenant, failOnError, false);
} }
void WorkloadBase::doExecute(TOpStartFct startFct, void WorkloadBase::doExecute(TOpStartFct startFct,

View File

@ -125,7 +125,10 @@ protected:
bool failOnError = true); bool failOnError = true);
// Execute a non-transactional database operation within the workload // Execute a non-transactional database operation within the workload
void execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError = true); void execOperation(TOpStartFct startFct,
TTaskFct cont,
std::optional<fdb::BytesRef> tenant = std::optional<fdb::BytesRef>(),
bool failOnError = true);
// Log an error message, increase error counter // Log an error message, increase error counter
void error(const std::string& msg); void error(const std::string& msg);

View File

@ -677,7 +677,28 @@ public:
} }
}; };
class Tenant final { // Handle this as an abstract class instead of interface to preserve lifetime of fdb objects owned by Tenant and
// Database.
class IDatabaseOps {
public:
virtual ~IDatabaseOps() = default;
virtual Transaction createTransaction() = 0;
virtual TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin,
KeyRef end,
int rangeLimit) = 0;
virtual TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) = 0;
virtual TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin,
KeyRef end,
int64_t version,
bool force) = 0;
virtual TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) = 0;
};
class Tenant final : public IDatabaseOps {
friend class Database; friend class Database;
std::shared_ptr<native::FDBTenant> tenant; std::shared_ptr<native::FDBTenant> tenant;
@ -694,6 +715,14 @@ public:
Tenant& operator=(const Tenant&) noexcept = default; Tenant& operator=(const Tenant&) noexcept = default;
Tenant() noexcept : tenant(nullptr) {} Tenant() noexcept : tenant(nullptr) {}
void atomic_store(Tenant other) { std::atomic_store(&tenant, other.tenant); }
Tenant atomic_load() {
Tenant retVal;
retVal.tenant = std::atomic_load(&tenant);
return retVal;
}
static void createTenant(Transaction tr, BytesRef name) { static void createTenant(Transaction tr, BytesRef name) {
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, BytesRef()); tr.setOption(FDBTransactionOption::FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, BytesRef());
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE, BytesRef()); tr.setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE, BytesRef());
@ -715,7 +744,7 @@ public:
return tr.get(toBytesRef(fmt::format("{}{}", tenantManagementMapPrefix, toCharsRef(name))), false); return tr.get(toBytesRef(fmt::format("{}{}", tenantManagementMapPrefix, toCharsRef(name))), false);
} }
Transaction createTransaction() { Transaction createTransaction() override {
auto tx_native = static_cast<native::FDBTransaction*>(nullptr); auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
auto err = Error(native::fdb_tenant_create_transaction(tenant.get(), &tx_native)); auto err = Error(native::fdb_tenant_create_transaction(tenant.get(), &tx_native));
if (err) if (err)
@ -723,14 +752,49 @@ public:
return Transaction(tx_native); return Transaction(tx_native);
} }
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) { TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant) if (!tenant)
throw std::runtime_error("blobbifyRange from null tenant"); throw std::runtime_error("blobbifyRange() from null tenant");
return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end)); return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
} }
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant)
throw std::runtime_error("unblobbifyRange() from null tenant");
return native::fdb_tenant_unblobbify_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!tenant)
throw std::runtime_error("listBlobbifiedRanges() from null tenant");
return native::fdb_tenant_list_blobbified_ranges(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
}
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!tenant)
throw std::runtime_error("verifyBlobRange() from null tenant");
return native::fdb_tenant_verify_blob_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
}
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!tenant)
throw std::runtime_error("purgeBlobGranules() from null tenant");
native::fdb_bool_t forceBool = force;
return native::fdb_tenant_purge_blob_granules(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
}
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!tenant)
throw std::runtime_error("waitPurgeGranulesComplete() from null tenant");
return native::fdb_tenant_wait_purge_granules_complete(tenant.get(), purgeKey.data(), intSize(purgeKey));
}
}; };
class Database { class Database : public IDatabaseOps {
friend class Tenant; friend class Tenant;
std::shared_ptr<native::FDBDatabase> db; std::shared_ptr<native::FDBDatabase> db;
@ -789,7 +853,7 @@ public:
return Tenant(tenant_native); return Tenant(tenant_native);
} }
Transaction createTransaction() { Transaction createTransaction() override {
if (!db) if (!db)
throw std::runtime_error("create_transaction from null database"); throw std::runtime_error("create_transaction from null database");
auto tx_native = static_cast<native::FDBTransaction*>(nullptr); auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
@ -799,33 +863,33 @@ public:
return Transaction(tx_native); return Transaction(tx_native);
} }
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) { TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!db) if (!db)
throw std::runtime_error("listBlobbifiedRanges from null database"); throw std::runtime_error("listBlobbifiedRanges from null database");
return native::fdb_database_list_blobbified_ranges( return native::fdb_database_list_blobbified_ranges(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit); db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
} }
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) { TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!db) if (!db)
throw std::runtime_error("verifyBlobRange from null database"); throw std::runtime_error("verifyBlobRange from null database");
return native::fdb_database_verify_blob_range( return native::fdb_database_verify_blob_range(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version); db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
} }
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) { TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!db) if (!db)
throw std::runtime_error("blobbifyRange from null database"); throw std::runtime_error("blobbifyRange from null database");
return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
} }
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) { TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!db) if (!db)
throw std::runtime_error("unblobbifyRange from null database"); throw std::runtime_error("unblobbifyRange from null database");
return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end)); return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
} }
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) { TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!db) if (!db)
throw std::runtime_error("purgeBlobGranules from null database"); throw std::runtime_error("purgeBlobGranules from null database");
native::fdb_bool_t forceBool = force; native::fdb_bool_t forceBool = force;
@ -833,7 +897,7 @@ public:
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool); db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
} }
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) { TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!db) if (!db)
throw std::runtime_error("purgeBlobGranules from null database"); throw std::runtime_error("purgeBlobGranules from null database");
return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey)); return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey));

View File

@ -1,12 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from argparse import ArgumentParser, RawDescriptionHelpFormatter import argparse
from pathlib import Path from pathlib import Path
import platform
import shutil import shutil
import subprocess import subprocess
import sys import sys
import os import os
import glob import glob
import unittest
sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "tests", "TestRunner")] sys.path[:0] = [os.path.join(os.path.dirname(__file__), "..", "..", "..", "tests", "TestRunner")]
@ -18,6 +18,9 @@ from local_cluster import LocalCluster, random_secret_string
PREV_RELEASE_VERSION = "7.1.5" PREV_RELEASE_VERSION = "7.1.5"
PREV_PREV_RELEASE_VERSION = "7.0.0" PREV_PREV_RELEASE_VERSION = "7.0.0"
args = None
downloader = None
def version_from_str(ver_str): def version_from_str(ver_str):
ver = [int(s) for s in ver_str.split(".")] ver = [int(s) for s in ver_str.split(".")]
@ -30,11 +33,9 @@ def api_version_from_str(ver_str):
return ver_tuple[0] * 100 + ver_tuple[1] * 10 return ver_tuple[0] * 100 + ver_tuple[1] * 10
class TestEnv(LocalCluster): class TestCluster(LocalCluster):
def __init__( def __init__(
self, self,
args,
downloader: FdbBinaryDownloader,
version: str, version: str,
): ):
self.client_config_tester_bin = Path(args.client_config_tester_bin).resolve() self.client_config_tester_bin = Path(args.client_config_tester_bin).resolve()
@ -44,35 +45,33 @@ class TestEnv(LocalCluster):
assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir) assert self.build_dir.is_dir(), "{} is not a directory".format(args.build_dir)
self.tmp_dir = self.build_dir.joinpath("tmp", random_secret_string(16)) self.tmp_dir = self.build_dir.joinpath("tmp", random_secret_string(16))
self.tmp_dir.mkdir(parents=True) self.tmp_dir.mkdir(parents=True)
self.downloader = downloader
self.version = version self.version = version
super().__init__( super().__init__(
self.tmp_dir, self.tmp_dir,
self.downloader.binary_path(version, "fdbserver"), downloader.binary_path(version, "fdbserver"),
self.downloader.binary_path(version, "fdbmonitor"), downloader.binary_path(version, "fdbmonitor"),
self.downloader.binary_path(version, "fdbcli"), downloader.binary_path(version, "fdbcli"),
1, 1,
) )
self.set_env_var("LD_LIBRARY_PATH", self.downloader.lib_dir(version)) self.set_env_var("LD_LIBRARY_PATH", downloader.lib_dir(version))
self.failed_cnt = 0
def __enter__(self): def setup(self):
super().__enter__() self.__enter__()
super().create_database() self.create_database()
return self
def __exit__(self, xc_type, exc_value, traceback): def tearDown(self):
super().__exit__(xc_type, exc_value, traceback) self.__exit__(None, None, None)
shutil.rmtree(self.tmp_dir) shutil.rmtree(self.tmp_dir)
# Client configuration tests using a cluster of the current version
class ClientConfigTest: class ClientConfigTest:
def __init__(self, test_env: TestEnv, title: str): def __init__(self, tc: unittest.TestCase):
self.test_env = test_env self.tc = tc
self.title = title self.cluster = tc.cluster
self.external_lib_dir = None self.external_lib_dir = None
self.external_lib_path = None self.external_lib_path = None
self.test_dir = self.test_env.tmp_dir.joinpath(random_secret_string(16)) self.test_dir = self.cluster.tmp_dir.joinpath(random_secret_string(16))
self.test_dir.mkdir(parents=True) self.test_dir.mkdir(parents=True)
self.log_dir = self.test_dir.joinpath("log") self.log_dir = self.test_dir.joinpath("log")
self.log_dir.mkdir(parents=True) self.log_dir.mkdir(parents=True)
@ -88,31 +87,28 @@ class ClientConfigTest:
self.external_lib_dir = self.test_dir.joinpath("extclients") self.external_lib_dir = self.test_dir.joinpath("extclients")
self.external_lib_dir.mkdir(parents=True) self.external_lib_dir.mkdir(parents=True)
for version in versions: for version in versions:
src_file_path = self.test_env.downloader.lib_path(version) src_file_path = downloader.lib_path(version)
assert src_file_path.exists(), "{} does not exist".format(src_file_path) self.tc.assertTrue(src_file_path.exists(), "{} does not exist".format(src_file_path))
target_file_path = self.external_lib_dir.joinpath("libfdb_c.{}.so".format(version)) target_file_path = self.external_lib_dir.joinpath("libfdb_c.{}.so".format(version))
shutil.copyfile(src_file_path, target_file_path) shutil.copyfile(src_file_path, target_file_path)
assert target_file_path.exists(), "{} does not exist".format(target_file_path) self.tc.assertTrue(target_file_path.exists(), "{} does not exist".format(target_file_path))
def create_external_lib_path(self, version): def create_external_lib_path(self, version):
src_file_path = self.test_env.downloader.lib_path(version) src_file_path = downloader.lib_path(version)
assert src_file_path.exists(), "{} does not exist".format(src_file_path) self.tc.assertTrue(src_file_path.exists(), "{} does not exist".format(src_file_path))
self.external_lib_path = self.test_dir.joinpath("libfdb_c.{}.so".format(version)) self.external_lib_path = self.test_dir.joinpath("libfdb_c.{}.so".format(version))
shutil.copyfile(src_file_path, self.external_lib_path) shutil.copyfile(src_file_path, self.external_lib_path)
assert self.external_lib_path.exists(), "{} does not exist".format(self.external_lib_path) self.tc.assertTrue(self.external_lib_path.exists(), "{} does not exist".format(self.external_lib_path))
def dump_client_logs(self): def dump_client_logs(self):
for log_file in glob.glob(os.path.join(self.log_dir, "*")): for log_file in glob.glob(os.path.join(self.log_dir, "*")):
print(">>>>>>>>>>>>>>>>>>>> Contents of {}:".format(log_file)) print(">>>>>>>>>>>>>>>>>>>> Contents of {}:".format(log_file), file=sys.stderr)
with open(log_file, "r") as f: with open(log_file, "r") as f:
print(f.read()) print(f.read(), file=sys.stderr)
print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file)) print(">>>>>>>>>>>>>>>>>>>> End of {}:".format(log_file), file=sys.stderr)
def exec(self): def exec(self):
print("-" * 80) cmd_args = [self.cluster.client_config_tester_bin, "--cluster-file", self.cluster.cluster_file]
print(self.title)
print("-" * 80)
cmd_args = [self.test_env.client_config_tester_bin, "--cluster-file", self.test_env.cluster_file]
if self.tmp_dir is not None: if self.tmp_dir is not None:
cmd_args += ["--tmp-dir", self.tmp_dir] cmd_args += ["--tmp-dir", self.tmp_dir]
@ -141,61 +137,66 @@ class ClientConfigTest:
if self.transaction_timeout is not None: if self.transaction_timeout is not None:
cmd_args += ["--transaction-timeout", str(self.transaction_timeout)] cmd_args += ["--transaction-timeout", str(self.transaction_timeout)]
print("Executing test command: {}".format(" ".join([str(c) for c in cmd_args]))) print("\nExecuting test command: {}".format(" ".join([str(c) for c in cmd_args])), file=sys.stderr)
tester_proc = subprocess.Popen(cmd_args, stdout=sys.stdout, stderr=sys.stderr) try:
tester_retcode = tester_proc.wait() tester_proc = subprocess.Popen(cmd_args, stdout=sys.stdout, stderr=sys.stderr)
if tester_retcode != 0: tester_retcode = tester_proc.wait()
print("Test '{}' failed".format(self.title)) self.tc.assertEqual(0, tester_retcode)
self.test_env.failed_cnt += 1 finally:
self.cleanup()
self.cleanup()
def cleanup(self): def cleanup(self):
shutil.rmtree(self.test_dir) shutil.rmtree(self.test_dir)
class ClientConfigTests: class ClientConfigTests(unittest.TestCase):
def __init__(self, args): @classmethod
self.args = args def setUpClass(cls):
self.downloader = FdbBinaryDownloader(args.build_dir) cls.cluster = TestCluster(CURRENT_VERSION)
# binary downloads are currently available only for x86_64 cls.cluster.setup()
self.platform = platform.machine()
if self.platform == "x86_64":
self.downloader.download_old_binaries(PREV_RELEASE_VERSION)
self.downloader.download_old_binaries(PREV_PREV_RELEASE_VERSION)
def test_local_client_only(self, test_env): @classmethod
test = ClientConfigTest(test_env, "Local client only") def tearDownClass(cls):
cls.cluster.tearDown()
def test_local_client_only(self):
# Local client only
test = ClientConfigTest(self)
test.exec() test.exec()
def test_single_external_client_only(self, test_env): def test_single_external_client_only(self):
test = ClientConfigTest(test_env, "Single external client") # Single external client only
test = ClientConfigTest(self)
test.create_external_lib_path(CURRENT_VERSION) test.create_external_lib_path(CURRENT_VERSION)
test.disable_local_client = True test.disable_local_client = True
test.exec() test.exec()
def test_same_local_and_external_client(self, test_env): def test_same_local_and_external_client(self):
test = ClientConfigTest(test_env, "Same Local & External Client") # Same version local & external client
test = ClientConfigTest(self)
test.create_external_lib_path(CURRENT_VERSION) test.create_external_lib_path(CURRENT_VERSION)
test.exec() test.exec()
def test_multiple_external_clients(self, test_env): def test_multiple_external_clients(self):
test = ClientConfigTest(test_env, "Multiple external clients") # Multiple external clients, normal case
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION]) test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.disable_local_client = True test.disable_local_client = True
test.api_version = api_version_from_str(PREV_PREV_RELEASE_VERSION) test.api_version = api_version_from_str(PREV_PREV_RELEASE_VERSION)
test.exec() test.exec()
def test_no_external_client_support_api_version(self, test_env): def test_no_external_client_support_api_version(self):
test = ClientConfigTest(test_env, "Multiple external clients; API version supported by none") # Multiple external clients, API version supported by none of them
test = ClientConfigTest(self)
test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION]) test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.disable_local_client = True test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION) test.api_version = api_version_from_str(CURRENT_VERSION)
test.expected_error = 2204 # API function missing test.expected_error = 2204 # API function missing
test.exec() test.exec()
def test_no_external_client_support_api_version_ignore(self, test_env): def test_no_external_client_support_api_version_ignore(self):
test = ClientConfigTest(test_env, "Multiple external clients; API version supported by none; Ignore failures") # Multiple external clients; API version supported by none of them; Ignore failures
test = ClientConfigTest(self)
test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION]) test.create_external_lib_dir([PREV_PREV_RELEASE_VERSION, PREV_RELEASE_VERSION])
test.disable_local_client = True test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION) test.api_version = api_version_from_str(CURRENT_VERSION)
@ -203,79 +204,66 @@ class ClientConfigTests:
test.expected_error = 2124 # All external clients failed test.expected_error = 2124 # All external clients failed
test.exec() test.exec()
def test_one_external_client_wrong_api_version(self, test_env): def test_one_external_client_wrong_api_version(self):
test = ClientConfigTest(test_env, "Multiple external clients: API version unsupported by one") # Multiple external clients, API version unsupported by one of othem
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION]) test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.disable_local_client = True test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION) test.api_version = api_version_from_str(CURRENT_VERSION)
test.expected_error = 2204 # API function missing test.expected_error = 2204 # API function missing
test.exec() test.exec()
def test_one_external_client_wrong_api_version_ignore(self, test_env): def test_one_external_client_wrong_api_version_ignore(self):
test = ClientConfigTest(test_env, "Multiple external clients; API version unsupported by one; Ignore failures") # Multiple external clients; API version unsupported by one of them; Ignore failures
test = ClientConfigTest(self)
test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION]) test.create_external_lib_dir([CURRENT_VERSION, PREV_RELEASE_VERSION, PREV_PREV_RELEASE_VERSION])
test.disable_local_client = True test.disable_local_client = True
test.api_version = api_version_from_str(CURRENT_VERSION) test.api_version = api_version_from_str(CURRENT_VERSION)
test.ignore_external_client_failures = True test.ignore_external_client_failures = True
test.exec() test.exec()
def test_prev_release_with_ext_client(self, test_env):
test = ClientConfigTest(test_env, "Cluster with previous release version") # Client configuration tests using a cluster of previous release version
class ClientConfigPrevVersionTests(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.cluster = TestCluster(PREV_RELEASE_VERSION)
cls.cluster.setup()
@classmethod
def tearDownClass(cls):
cls.cluster.tearDown()
def test_external_client(self):
# Using an external client to connect
test = ClientConfigTest(self)
test.create_external_lib_path(PREV_RELEASE_VERSION) test.create_external_lib_path(PREV_RELEASE_VERSION)
test.api_version = api_version_from_str(PREV_RELEASE_VERSION) test.api_version = api_version_from_str(PREV_RELEASE_VERSION)
test.exec() test.exec()
def test_prev_release_with_ext_client_unsupported_api(self, test_env): def test_prev_release_with_ext_client_unsupported_api(self):
test = ClientConfigTest(test_env, "Cluster with previous release version; Unsupported API version") # Leaving an unsupported API version
test = ClientConfigTest(self)
test.create_external_lib_path(PREV_RELEASE_VERSION) test.create_external_lib_path(PREV_RELEASE_VERSION)
test.expected_error = 2204 # API function missing test.expected_error = 2204 # API function missing
test.exec() test.exec()
def test_prev_release_with_ext_client_unsupported_api_ignore(self, test_env): def test_prev_release_with_ext_client_unsupported_api_ignore(self):
test = ClientConfigTest( # Leaving an unsupported API version, ignore failures
test_env, "Cluster with previous release version; Unsupported API version; Ignore failures" test = ClientConfigTest(self)
)
test.create_external_lib_path(PREV_RELEASE_VERSION) test.create_external_lib_path(PREV_RELEASE_VERSION)
test.transaction_timeout = 100 test.transaction_timeout = 100
test.expected_error = 1031 # Timeout test.expected_error = 1031 # Timeout
test.ignore_external_client_failures = True test.ignore_external_client_failures = True
test.exec() test.exec()
def run_tests(self):
failed_cnt = 0
with TestEnv(self.args, self.downloader, CURRENT_VERSION) as test_env:
self.test_local_client_only(test_env)
self.test_single_external_client_only(test_env)
self.test_same_local_and_external_client(test_env)
self.test_multiple_external_clients(test_env)
self.test_no_external_client_support_api_version(test_env)
self.test_no_external_client_support_api_version_ignore(test_env)
self.test_one_external_client_wrong_api_version(test_env)
self.test_one_external_client_wrong_api_version_ignore(test_env)
failed_cnt += test_env.failed_cnt
if self.platform == "x86_64":
with TestEnv(self.args, self.downloader, PREV_RELEASE_VERSION) as test_env:
self.test_prev_release_with_ext_client(test_env)
self.test_prev_release_with_ext_client_unsupported_api(test_env)
self.test_prev_release_with_ext_client_unsupported_api_ignore(test_env)
failed_cnt += test_env.failed_cnt
if failed_cnt > 0:
print("{} tests failed".format(failed_cnt))
else:
print("All tests successful")
return failed_cnt
if __name__ == "__main__": if __name__ == "__main__":
parser = ArgumentParser( parser = argparse.ArgumentParser(
formatter_class=RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
description=""" description="""
A script for testing FDB multi-version client in upgrade scenarios. Creates a local cluster, Unit tests for running FDB client with different configurations.
generates a workload using fdb_c_api_tester with a specified test file, and performs Also accepts python unit tests command line arguments.
cluster upgrade according to the specified upgrade path. Checks if the workload successfully
progresses after each upgrade step.
""", """,
) )
parser.add_argument( parser.add_argument(
@ -291,7 +279,13 @@ if __name__ == "__main__":
help="Path to the fdb_c_client_config_tester executable.", help="Path to the fdb_c_client_config_tester executable.",
required=True, required=True,
) )
parser.add_argument("unittest_args", nargs=argparse.REMAINDER)
args = parser.parse_args() args = parser.parse_args()
test = ClientConfigTests(args) sys.argv[1:] = args.unittest_args
failed_cnt = test.run_tests()
sys.exit(failed_cnt) downloader = FdbBinaryDownloader(args.build_dir)
downloader.download_old_binaries(PREV_RELEASE_VERSION)
downloader.download_old_binaries(PREV_PREV_RELEASE_VERSION)
unittest.main(verbosity=2)

View File

@ -0,0 +1,47 @@
/*
* BlobRestoreCommand.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/FDBOptions.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace fdb_cli {
ACTOR Future<bool> blobRestoreCommandActor(Database localDb, std::vector<StringRef> tokens) {
if (tokens.size() != 1 && tokens.size() != 2) {
printUsage(tokens[0]);
return false;
}
state bool success = false;
wait(store(success, localDb->blobRestore(normalKeys)));
if (success) {
fmt::print("Started blob restore for the full cluster. Please use 'status' command to check progress.\n");
} else {
fmt::print("Fail to start a new blob restore while there is a pending one.\n");
}
return success;
}
CommandFactory blobRestoreFactory("blobrestore", CommandHelp("blobrestore", "", ""));
} // namespace fdb_cli

View File

@ -1416,6 +1416,13 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
continue; continue;
} }
if (tokencmp(tokens[0], "blobrestore")) {
bool _result = wait(makeInterruptable(blobRestoreCommandActor(localDb, tokens)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "unlock")) { if (tokencmp(tokens[0], "unlock")) {
if ((tokens.size() != 2) || (tokens[1].size() != 32) || if ((tokens.size() != 2) || (tokens[1].size() != 32) ||
!std::all_of(tokens[1].begin(), tokens[1].end(), &isxdigit)) { !std::all_of(tokens[1].begin(), tokens[1].end(), &isxdigit)) {

View File

@ -213,6 +213,9 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
ACTOR Future<bool> blobKeyCommandActor(Database localDb, ACTOR Future<bool> blobKeyCommandActor(Database localDb,
Optional<TenantMapEntry> tenantEntry, Optional<TenantMapEntry> tenantEntry,
std::vector<StringRef> tokens); std::vector<StringRef> tokens);
// blobrestore command
ACTOR Future<bool> blobRestoreCommandActor(Database localDb, std::vector<StringRef> tokens);
// maintenance command // maintenance command
ACTOR Future<bool> setHealthyZone(Reference<IDatabase> db, StringRef zoneId, double seconds, bool printWarning = false); ACTOR Future<bool> setHealthyZone(Reference<IDatabase> db, StringRef zoneId, double seconds, bool printWarning = false);
ACTOR Future<bool> clearHealthyZone(Reference<IDatabase> db, ACTOR Future<bool> clearHealthyZone(Reference<IDatabase> db,

View File

@ -45,7 +45,12 @@ def run_fdbcli_command(*args):
string: Console output from fdbcli string: Console output from fdbcli
""" """
commands = command_template + ["{}".format(' '.join(args))] commands = command_template + ["{}".format(' '.join(args))]
return subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env).stdout.decode('utf-8').strip() try:
# if the fdbcli command is stuck for more than 20 seconds, the database is definitely unavailable
process = subprocess.run(commands, stdout=subprocess.PIPE, env=fdbcli_env, timeout=20)
return process.stdout.decode('utf-8').strip()
except subprocess.TimeoutExpired:
raise Exception('The fdbcli command is stuck, database is unavailable')
def run_fdbcli_command_and_get_error(*args): def run_fdbcli_command_and_get_error(*args):
@ -1079,16 +1084,19 @@ if __name__ == '__main__':
lockAndUnlock() lockAndUnlock()
maintenance() maintenance()
profile() profile()
suspend() # TODO: reenable it until it's stable
# suspend()
transaction() transaction()
throttle() # this is replaced by the "quota" command
#throttle()
triggerddteaminfolog() triggerddteaminfolog()
tenants() tenants()
versionepoch() versionepoch()
integer_options() integer_options()
tls_address_suffix() tls_address_suffix()
knobmanagement() knobmanagement()
quota() # TODO: fix the issue when running through the external client
#quota()
else: else:
assert args.process_number > 1, "Process number should be positive" assert args.process_number > 1, "Process number should be positive"
coordinators() coordinators()

View File

@ -971,6 +971,11 @@ void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion,
// clearVersion as previous guy) // clearVersion as previous guy)
} }
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange) {
SortedDeltasT deltasByKey;
sortDeltasByKey(deltasByVersion, fileRange, deltasByKey);
}
// FIXME: Could maybe reduce duplicated code between this and chunkedSnapshot for chunking // FIXME: Could maybe reduce duplicated code between this and chunkedSnapshot for chunking
Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef, Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef,
const Standalone<GranuleDeltas>& deltas, const Standalone<GranuleDeltas>& deltas,

View File

@ -5924,7 +5924,6 @@ public:
printf("Restoring backup to version: %lld\n", (long long)targetVersion); printf("Restoring backup to version: %lld\n", (long long)targetVersion);
} }
state int retryCount = 0;
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx)); state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
loop { loop {
try { try {
@ -5948,17 +5947,9 @@ public:
wait(tr->commit()); wait(tr->commit());
break; break;
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_transaction_too_old) {
retryCount++;
}
if (e.code() == error_code_restore_duplicate_tag) { if (e.code() == error_code_restore_duplicate_tag) {
throw; throw;
} }
if (g_network->isSimulated() && retryCount > 50) {
CODE_PROBE(true, "submitRestore simulation speedup");
// try to make the read window back to normal size (5 * version_per_sec)
g_simulator->speedUpSimulation = true;
}
wait(tr->onError(e)); wait(tr->onError(e));
} }
} }

View File

@ -2559,15 +2559,21 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
} }
} }
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) { void setStorageQuota(Transaction& tr, StringRef tenantGroupName, int64_t quota) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantName); auto key = storageQuotaKey(tenantGroupName);
tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned())); tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned()));
} }
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) { void clearStorageQuota(Transaction& tr, StringRef tenantGroupName) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantGroupName);
tr.clear(key);
}
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantGroupName) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName))); state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantGroupName)));
if (!v.present()) { if (!v.present()) {
return Optional<int64_t>(); return Optional<int64_t>();
} }

View File

@ -4524,9 +4524,11 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
output.readToBegin = readToBegin; output.readToBegin = readToBegin;
output.readThroughEnd = readThroughEnd; output.readThroughEnd = readThroughEnd;
if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) { if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows) &&
(!std::is_same<GetKeyValuesFamilyRequest, GetMappedKeyValuesRequest>::value)) {
// Copy instead of resizing because TSS maybe be using output's arena for comparison. This only // Copy instead of resizing because TSS maybe be using output's arena for comparison. This only
// happens in simulation so it's fine // happens in simulation so it's fine
// disable it on prefetch, because boundary entries serve as continuations
RangeResultFamily copy; RangeResultFamily copy;
int newSize = int newSize =
deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()); deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size());
@ -10915,6 +10917,37 @@ Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName); return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
} }
ACTOR Future<bool> blobRestoreActor(Reference<DatabaseContext> cx, KeyRange range) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr->get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (status.progress < 100) {
return false; // stop if there is in-progress restore.
}
}
Standalone<BlobRestoreStatus> status;
status.progress = 0;
Value newValue = blobRestoreCommandValueFor(status);
tr->set(key, newValue);
wait(tr->commit());
return true;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
Future<bool> DatabaseContext::blobRestore(KeyRange range) {
return blobRestoreActor(Reference<DatabaseContext>::addRef(this), range);
}
int64_t getMaxKeySize(KeyRef const& key) { int64_t getMaxKeySize(KeyRef const& key) {
return getMaxWriteKeySize(key, true); return getMaxWriteKeySize(key, true);
} }

View File

@ -296,7 +296,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000; init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000;
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 ); init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 );
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120; init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false ); if(isSimulated) DD_TENANT_AWARENESS_ENABLED = deterministicRandom()->coinflip(); init( DD_TENANT_AWARENESS_ENABLED, false );
init( STORAGE_QUOTA_ENABLED, false ); if(isSimulated) STORAGE_QUOTA_ENABLED = deterministicRandom()->coinflip();
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
@ -387,7 +388,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_BACKGROUND_PARALLELISM, 4 ); init( ROCKSDB_BACKGROUND_PARALLELISM, 4 );
init( ROCKSDB_READ_PARALLELISM, 4 ); init( ROCKSDB_READ_PARALLELISM, 4 );
// If true, do not process and store RocksDB logs // If true, do not process and store RocksDB logs
init( ROCKSDB_MUTE_LOGS, false ); init( ROCKSDB_MUTE_LOGS, true );
// Use a smaller memtable in simulation to avoid OOMs. // Use a smaller memtable in simulation to avoid OOMs.
int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024; int64_t memtableBytes = isSimulated ? 32 * 1024 : 512 * 1024 * 1024;
init( ROCKSDB_MEMTABLE_BYTES, memtableBytes ); init( ROCKSDB_MEMTABLE_BYTES, memtableBytes );
@ -809,18 +810,24 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1; init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1; init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1; init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true; init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip();
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 ); init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true ); init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true ); init( QUICK_GET_KEY_VALUES_FALLBACK, true );
init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100); init( STRICTLY_ENFORCE_BYTE_LIMIT, false); if( randomize && BUGGIFY ) STRICTLY_ENFORCE_BYTE_LIMIT = deterministicRandom()->coinflip();
init( FRACTION_INDEX_BYTELIMIT_PREFETCH, 0.2); if( randomize && BUGGIFY ) FRACTION_INDEX_BYTELIMIT_PREFETCH = 0.01 + deterministicRandom()->random01();
init( MAX_PARALLEL_QUICK_GET_VALUE, 10 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100);
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 ); init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 ); init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 ); init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
// Read priority definitions in the form of a list of their relative concurrency share weights
init( STORAGESERVER_READ_PRIORITIES, "120,10,20,40,60" );
// The total concurrency which will be shared by active priorities according to their relative weights
init( STORAGE_SERVER_READ_CONCURRENCY, 70 ); init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order // The priority number which each ReadType maps to in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" ); // This exists for flexibility but assigning each ReadType to its own unique priority number makes the most sense
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" ); // The enumeration is currently: eager, fetch, low, normal, high
init( STORAGESERVER_READTYPE_PRIORITY_MAP, "0,1,2,3,4" );
//Wait Failure //Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2; init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -944,7 +951,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 ); init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; } init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); } init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" ); init( REDWOOD_IO_PRIORITIES, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false ); init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement // Server request latency measurement

View File

@ -1660,11 +1660,41 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) {
return interf; return interf;
} }
const KeyRangeRef blobRestoreCommandKeys("\xff\x02/blobRestoreCommand/"_sr, "\xff\x02/blobRestoreCommand0"_sr);
const Value blobRestoreCommandKeyFor(const KeyRangeRef range) {
BinaryWriter wr(AssumeVersion(ProtocolVersion::withBlobGranule()));
wr.serializeBytes(blobRestoreCommandKeys.begin);
wr << range;
return wr.toValue();
}
const KeyRange decodeBlobRestoreCommandKeyFor(const KeyRef key) {
KeyRange range;
BinaryReader reader(key.removePrefix(blobRestoreCommandKeys.begin),
AssumeVersion(ProtocolVersion::withBlobGranule()));
reader >> range;
return range;
}
const Value blobRestoreCommandValueFor(BlobRestoreStatus status) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << status;
return wr.toValue();
}
Standalone<BlobRestoreStatus> decodeBlobRestoreStatus(ValueRef const& value) {
Standalone<BlobRestoreStatus> status;
BinaryReader reader(value, IncludeVersion());
reader >> status;
return status;
}
const KeyRangeRef storageQuotaKeys("\xff/storageQuota/"_sr, "\xff/storageQuota0"_sr); const KeyRangeRef storageQuotaKeys("\xff/storageQuota/"_sr, "\xff/storageQuota0"_sr);
const KeyRef storageQuotaPrefix = storageQuotaKeys.begin; const KeyRef storageQuotaPrefix = storageQuotaKeys.begin;
Key storageQuotaKey(StringRef tenantName) { Key storageQuotaKey(StringRef tenantGroupName) {
return tenantName.withPrefix(storageQuotaPrefix); return tenantGroupName.withPrefix(storageQuotaPrefix);
} }
const KeyRangeRef idempotencyIdKeys("\xff\x02/idmp/"_sr, "\xff\x02/idmp0"_sr); const KeyRangeRef idempotencyIdKeys("\xff\x02/idmp/"_sr, "\xff\x02/idmp0"_sr);

View File

@ -18,6 +18,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "fdbrpc/Msgpack.h"
#include "fdbclient/Tracing.h" #include "fdbclient/Tracing.h"
#include "flow/IRandom.h" #include "flow/IRandom.h"
#include "flow/UnitTest.h" #include "flow/UnitTest.h"
@ -79,41 +80,6 @@ struct LogfileTracer : ITracer {
} }
}; };
struct TraceRequest {
std::unique_ptr<uint8_t[]> buffer;
// Amount of data in buffer (bytes).
std::size_t data_size;
// Size of buffer (bytes).
std::size_t buffer_size;
void write_byte(uint8_t byte) { write_bytes(&byte, 1); }
void write_bytes(const uint8_t* buf, std::size_t n) {
resize(n);
std::copy(buf, buf + n, buffer.get() + data_size);
data_size += n;
}
void resize(std::size_t n) {
if (data_size + n <= buffer_size) {
return;
}
std::size_t size = buffer_size;
while (size < data_size + n) {
size *= 2;
}
TraceEvent(SevInfo, "TracingSpanResizedBuffer").detail("OldSize", buffer_size).detail("NewSize", size);
auto new_buffer = std::make_unique<uint8_t[]>(size);
std::copy(buffer.get(), buffer.get() + data_size, new_buffer.get());
buffer = std::move(new_buffer);
buffer_size = size;
}
void reset() { data_size = 0; }
};
// A server listening for UDP trace messages, run only in simulation. // A server listening for UDP trace messages, run only in simulation.
ACTOR Future<Void> simulationStartServer() { ACTOR Future<Void> simulationStartServer() {
// We're going to force the address to be loopback regardless of FLOW_KNOBS->TRACING_UDP_LISTENER_ADDR // We're going to force the address to be loopback regardless of FLOW_KNOBS->TRACING_UDP_LISTENER_ADDR
@ -167,146 +133,89 @@ ACTOR Future<Void> traceLog(int* pendingMessages, bool* sendError) {
struct UDPTracer : public ITracer { struct UDPTracer : public ITracer {
// Serializes span fields as an array into the supplied TraceRequest // Serializes span fields as an array into the supplied TraceRequest
// buffer. // buffer.
void serialize_span(const Span& span, TraceRequest& request) { void serialize_span(const Span& span, MsgpackBuffer& buf) {
uint16_t size = 12; uint16_t size = 12;
request.write_byte(size | 0b10010000); // write as array buf.write_byte(size | 0b10010000); // write as array
serialize_value(span.context.traceID.first(), request, 0xcf); // trace id serialize_value(span.context.traceID.first(), buf, 0xcf); // trace id
serialize_value(span.context.traceID.second(), request, 0xcf); // trace id serialize_value(span.context.traceID.second(), buf, 0xcf); // trace id
serialize_value(span.context.spanID, request, 0xcf); // spanid serialize_value(span.context.spanID, buf, 0xcf); // spanid
// parent span id // parent span id
serialize_value(span.parentContext.spanID, request, 0xcf); // spanId serialize_value(span.parentContext.spanID, buf, 0xcf); // spanId
// Payload // Payload
serialize_string(span.location.name.toString(), request); serialize_string(span.location.name.toString(), buf);
serialize_value(span.begin, request, 0xcb); // start time serialize_value(span.begin, buf, 0xcb); // start time
serialize_value(span.end, request, 0xcb); // end serialize_value(span.end, buf, 0xcb); // end
// Kind // Kind
serialize_value(span.kind, request, 0xcc); serialize_value(span.kind, buf, 0xcc);
// Status // Status
serialize_value(span.status, request, 0xcc); serialize_value(span.status, buf, 0xcc);
// Links // Links
serialize_vector(span.links, request); serialize_vector(span.links, buf);
// Events // Events
serialize_vector(span.events, request); serialize_vector(span.events, buf);
// Attributes // Attributes
serialize_map(span.attributes, request); serialize_map(span.attributes, buf);
} }
private: private:
// Writes the given value to the request in big-endian byte order,
// preceded by the supplied msgpack type byte (e.g. 0xcf for uint64,
// 0xcb for float64). Works for any trivially-copyable T.
template <typename T>
inline void serialize_value(const T& val, TraceRequest& request, uint8_t msgpack_type) {
request.write_byte(msgpack_type);
const uint8_t* p = reinterpret_cast<const uint8_t*>(std::addressof(val));
// Emit bytes from most-significant to least-significant (big-endian),
// reversing the host's little-endian layout byte by byte.
for (size_t i = 0; i < sizeof(T); ++i) {
request.write_byte(p[sizeof(T) - i - 1]);
}
}
// Writes the given string to the request as a sequence of bytes. Inserts a
// format byte at the beginning of the string according to its length,
// as specified by the msgpack specification (fixstr / str8 / str16).
// Strings longer than 65535 bytes are not supported and trip an assert.
inline void serialize_string(const uint8_t* c, int length, TraceRequest& request) {
if (length <= 31) {
// A size 0 string is ok. We still need to write a byte
// identifying the item as a string, but can set the size to 0.
request.write_byte(static_cast<uint8_t>(length) | 0b10100000);
} else if (length <= 255) {
request.write_byte(0xd9);
request.write_byte(static_cast<uint8_t>(length));
} else if (length <= 65535) {
// str16: big-endian 16-bit length, written by indexing the raw bytes
// of the int — assumes a little-endian host.
request.write_byte(0xda);
request.write_byte(reinterpret_cast<const uint8_t*>(&length)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&length)[0]);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeString")
.detail("Failed to MessagePack encode very large string", length);
ASSERT_WE_THINK(false);
}
request.write_bytes(c, length);
}
// Convenience overload: serializes a std::string's bytes as a msgpack string.
inline void serialize_string(const std::string& str, TraceRequest& request) {
serialize_string(reinterpret_cast<const uint8_t*>(str.data()), str.size(), request);
}
// Writes the given vector of linked SpanContext's to the request. If the vector is // Writes the given vector of linked SpanContext's to the request. If the vector is
// empty, the request is not modified. // empty, the request is not modified.
inline void serialize_vector(const SmallVectorRef<SpanContext>& vec, TraceRequest& request) { inline void serialize_vector(const SmallVectorRef<SpanContext>& vec, MsgpackBuffer& buf) {
int size = vec.size(); int size = vec.size();
if (size <= 15) { if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10010000); buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) { } else if (size <= 65535) {
request.write_byte(0xdc); buf.write_byte(0xdc);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]); buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]); buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else { } else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size); TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false); ASSERT_WE_THINK(false);
} }
for (const auto& link : vec) { for (const auto& link : vec) {
serialize_value(link.traceID.first(), request, 0xcf); // trace id serialize_value(link.traceID.first(), buf, 0xcf); // trace id
serialize_value(link.traceID.second(), request, 0xcf); // trace id serialize_value(link.traceID.second(), buf, 0xcf); // trace id
serialize_value(link.spanID, request, 0xcf); // spanid serialize_value(link.spanID, buf, 0xcf); // spanid
} }
} }
// Writes the given vector of linked SpanContext's to the request. If the vector is // Writes the given vector of linked SpanEventRef's to the request. If the vector is
// empty, the request is not modified. // empty, the request is not modified.
inline void serialize_vector(const SmallVectorRef<SpanEventRef>& vec, TraceRequest& request) { inline void serialize_vector(const SmallVectorRef<SpanEventRef>& vec, MsgpackBuffer& buf) {
int size = vec.size(); int size = vec.size();
if (size <= 15) { if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10010000); buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) { } else if (size <= 65535) {
request.write_byte(0xdc); buf.write_byte(0xdc);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]); buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]); buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else { } else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size); TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false); ASSERT_WE_THINK(false);
} }
for (const auto& event : vec) { for (const auto& event : vec) {
serialize_string(event.name.toString(), request); // event name serialize_string(event.name.toString(), buf); // event name
serialize_value(event.time, request, 0xcb); // event time serialize_value(event.time, buf, 0xcb); // event time
serialize_vector(event.attributes, request); serialize_vector(event.attributes, buf);
} }
} }
inline void serialize_vector(const SmallVectorRef<KeyValueRef>& vals, TraceRequest& request) { inline void serialize_vector(const SmallVectorRef<KeyValueRef>& vals, MsgpackBuffer& buf) {
int size = vals.size(); int size = vals.size();
if (size <= 15) { if (size <= 15) {
// N.B. We're actually writing this out as a fixmap here in messagepack format! // N.B. We're actually writing this out as a fixmap here in messagepack format!
// fixmap 1000xxxx 0x80 - 0x8f // fixmap 1000xxxx 0x80 - 0x8f
request.write_byte(static_cast<uint8_t>(size) | 0b10000000); buf.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else { } else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size); TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false); ASSERT_WE_THINK(false);
} }
for (const auto& kv : vals) { for (const auto& kv : vals) {
serialize_string(kv.key.toString(), request); serialize_string(kv.key.toString(), buf);
serialize_string(kv.value.toString(), request); serialize_string(kv.value.toString(), buf);
}
}
template <class Map>
inline void serialize_map(const Map& map, TraceRequest& request) {
int size = map.size();
if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeMap").detail("Failed to MessagePack encode large map", size);
ASSERT_WE_THINK(false);
}
for (const auto& [key, value] : map) {
serialize_string(key.begin(), key.size(), request);
serialize_string(value.begin(), value.size(), request);
} }
} }
}; };
@ -336,9 +245,9 @@ ACTOR Future<Void> fastTraceLogger(int* unreadyMessages, int* failedMessages, in
struct FastUDPTracer : public UDPTracer { struct FastUDPTracer : public UDPTracer {
FastUDPTracer() FastUDPTracer()
: unready_socket_messages_(0), failed_messages_(0), total_messages_(0), socket_fd_(-1), send_error_(false) { : unready_socket_messages_(0), failed_messages_(0), total_messages_(0), socket_fd_(-1), send_error_(false) {
request_ = TraceRequest{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize), request_ = MsgpackBuffer{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0, .data_size = 0,
.buffer_size = kTraceBufferSize }; .buffer_size = kTraceBufferSize };
} }
TracerType type() const override { return TracerType::NETWORK_LOSSY; } TracerType type() const override { return TracerType::NETWORK_LOSSY; }
@ -394,7 +303,7 @@ struct FastUDPTracer : public UDPTracer {
} }
private: private:
TraceRequest request_; MsgpackBuffer request_;
int unready_socket_messages_; int unready_socket_messages_;
int failed_messages_; int failed_messages_;
@ -657,9 +566,9 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
IKnobCollection::getMutableGlobalKnobCollection().setKnob("tracing_span_attributes_enabled", IKnobCollection::getMutableGlobalKnobCollection().setKnob("tracing_span_attributes_enabled",
KnobValueRef::create(bool{ true })); KnobValueRef::create(bool{ true }));
Span span1("encoded_span"_loc); Span span1("encoded_span"_loc);
auto request = TraceRequest{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize), auto request = MsgpackBuffer{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0, .data_size = 0,
.buffer_size = kTraceBufferSize }; .buffer_size = kTraceBufferSize };
auto tracer = FastUDPTracer(); auto tracer = FastUDPTracer();
tracer.serialize_span(span1, request); tracer.serialize_span(span1, request);
auto data = request.buffer.get(); auto data = request.buffer.get();

View File

@ -313,4 +313,15 @@ struct BlobManifest {
} }
}; };
// Defines blob restore status.
// `progress` is a percentage; elsewhere in this change a value < 100 is
// treated as "restore still in progress" (see isFullRestoreMode).
struct BlobRestoreStatus {
constexpr static FileIdentifier file_identifier = 378657;
int progress;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, progress);
}
};
#endif #endif

View File

@ -56,4 +56,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix); std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);
#endif // For benchmark testing only. It should never be called in prod.
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange);
#endif

View File

@ -403,6 +403,7 @@ public:
Future<Version> verifyBlobRange(const KeyRange& range, Future<Version> verifyBlobRange(const KeyRange& range,
Optional<Version> version, Optional<Version> version,
Optional<TenantName> tenantName = {}); Optional<TenantName> tenantName = {});
Future<bool> blobRestore(const KeyRange range);
// private: // private:
explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord, explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,

View File

@ -163,9 +163,10 @@ bool schemaMatch(json_spirit::mValue const& schema,
// storage nodes // storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID); ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Set and get the storage quota per tenant // Set/clear/get the storage quota for the given tenant group
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota); void setStorageQuota(Transaction& tr, StringRef tenantGroupName, int64_t quota);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName); void clearStorageQuota(Transaction& tr, StringRef tenantGroupName);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantGroupName);
#include "flow/unactorcompiler.h" #include "flow/unactorcompiler.h"
#endif #endif

View File

@ -237,6 +237,8 @@ public:
int64_t int64_t
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED; bool DD_TENANT_AWARENESS_ENABLED;
bool STORAGE_QUOTA_ENABLED; // Whether storage quota enforcement for tenant groups and all the relevant storage
// usage / quota monitors are enabled.
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed
// in the TenantCache // in the TenantCache
@ -761,14 +763,16 @@ public:
bool ENABLE_CLEAR_RANGE_EAGER_READS; bool ENABLE_CLEAR_RANGE_EAGER_READS;
bool QUICK_GET_VALUE_FALLBACK; bool QUICK_GET_VALUE_FALLBACK;
bool QUICK_GET_KEY_VALUES_FALLBACK; bool QUICK_GET_KEY_VALUES_FALLBACK;
bool STRICTLY_ENFORCE_BYTE_LIMIT;
double FRACTION_INDEX_BYTELIMIT_PREFETCH;
int MAX_PARALLEL_QUICK_GET_VALUE; int MAX_PARALLEL_QUICK_GET_VALUE;
int CHECKPOINT_TRANSFER_BLOCK_BYTES; int CHECKPOINT_TRANSFER_BLOCK_BYTES;
int QUICK_GET_KEY_VALUES_LIMIT; int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES; int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT; int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES; std::string STORAGESERVER_READ_PRIORITIES;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READTYPE_PRIORITY_MAP;
// Wait Failure // Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS; int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -917,7 +921,7 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS; std::string REDWOOD_IO_PRIORITIES;
// Server request latency measurement // Server request latency measurement
int LATENCY_SAMPLE_SIZE; int LATENCY_SAMPLE_SIZE;

View File

@ -710,11 +710,18 @@ UID decodeBlobWorkerListKey(KeyRef const& key);
const Value blobWorkerListValue(BlobWorkerInterface const& interface); const Value blobWorkerListValue(BlobWorkerInterface const& interface);
BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value); BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value);
// Blob restore command
extern const KeyRangeRef blobRestoreCommandKeys;
const Value blobRestoreCommandKeyFor(const KeyRangeRef range);
const KeyRange decodeBlobRestoreCommandKeyFor(const KeyRef key);
const Value blobRestoreCommandValueFor(BlobRestoreStatus status);
Standalone<BlobRestoreStatus> decodeBlobRestoreStatus(ValueRef const& value);
// Storage quota per tenant // Storage quota per tenant
// "\xff/storageQuota/[[tenantName]]" := "[[quota]]" // "\xff/storageQuota/[[tenantGroupName]]" := "[[quota]]"
extern const KeyRangeRef storageQuotaKeys; extern const KeyRangeRef storageQuotaKeys;
extern const KeyRef storageQuotaPrefix; extern const KeyRef storageQuotaPrefix;
Key storageQuotaKey(StringRef tenantName); Key storageQuotaKey(StringRef tenantGroupName);
extern const KeyRangeRef idempotencyIdKeys; extern const KeyRangeRef idempotencyIdKeys;
extern const KeyRef idempotencyIdsExpiredVersion; extern const KeyRef idempotencyIdsExpiredVersion;

View File

@ -0,0 +1,157 @@
/*
* Msgpack.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBRPC_MSGPACK_H
#define FDBRPC_MSGPACK_H
#include <limits>
#pragma once
#include <memory>
#include <algorithm>
#include "flow/Trace.h"
#include "flow/Error.h"
#include "flow/network.h"
// A resizable, append-only byte buffer used as the destination for
// MessagePack (msgpack) serialization.
//
// Invariant: data_size <= buffer_size, and `buffer` holds buffer_size
// bytes. Growth is geometric (doubling), so repeated appends are
// amortized O(1).
struct MsgpackBuffer {
	std::unique_ptr<uint8_t[]> buffer;
	// Amount of data in buffer (bytes).
	std::size_t data_size;
	// Size of buffer (bytes).
	std::size_t buffer_size;

	// Appends a single byte, growing the buffer if necessary.
	void write_byte(uint8_t byte) { write_bytes(&byte, 1); }

	// Overwrites a byte at an already-written position.
	// This assumes that pos <= data_size
	void edit_byte(uint8_t byte, size_t pos) { buffer[pos] = byte; }

	// Appends n bytes from buf, growing the buffer if necessary.
	void write_bytes(const uint8_t* buf, std::size_t n) {
		resize(n);
		std::copy(buf, buf + n, buffer.get() + data_size);
		data_size += n;
	}

	// Ensures capacity for n additional bytes; doubles the allocation
	// until the pending write fits, then copies the contents over.
	void resize(std::size_t n) {
		if (data_size + n <= buffer_size) {
			return;
		}
		// Start from 1 when the buffer was created with zero capacity;
		// otherwise the doubling loop below could never terminate
		// (0 * 2 == 0).
		std::size_t size = buffer_size == 0 ? 1 : buffer_size;
		while (size < data_size + n) {
			size *= 2;
		}
		TraceEvent(SevInfo, "MsgpackResizedBuffer").detail("OldSize", buffer_size).detail("NewSize", size);
		auto new_buffer = std::make_unique<uint8_t[]>(size);
		std::copy(buffer.get(), buffer.get() + data_size, new_buffer.get());
		buffer = std::move(new_buffer);
		buffer_size = size;
	}

	// Discards the logical contents but keeps the allocation for reuse.
	void reset() { data_size = 0; }
};
// Appends the msgpack encoding of a boolean: 0xc3 for true, 0xc2 for false.
inline void serialize_bool(bool val, MsgpackBuffer& buf) {
	buf.write_byte(val ? 0xc3 : 0xc2);
}
// Writes the given value to the buffer in big-endian byte order,
// preceded by the supplied msgpack type byte (e.g. 0xcf for uint64,
// 0xcb for float64). Works for any trivially-copyable T.
template <typename T>
inline void serialize_value(const T& val, MsgpackBuffer& buf, uint8_t msgpack_type) {
buf.write_byte(msgpack_type);
const uint8_t* p = reinterpret_cast<const uint8_t*>(std::addressof(val));
// Emit bytes from most-significant to least-significant (big-endian),
// reversing the host's little-endian layout byte by byte.
for (size_t i = 0; i < sizeof(T); ++i) {
buf.write_byte(p[sizeof(T) - i - 1]);
}
}
// Writes the given string to the buffer as a sequence of bytes. Inserts a
// format byte at the beginning of the string according to its length,
// as specified by the msgpack specification (fixstr / str8 / str16).
// Strings longer than 65535 bytes are not supported and trip an assert.
inline void serialize_string(const uint8_t* c, int length, MsgpackBuffer& buf) {
if (length <= 31) {
// A size 0 string is ok. We still need to write a byte
// identifying the item as a string, but can set the size to 0.
buf.write_byte(static_cast<uint8_t>(length) | 0b10100000);
} else if (length <= 255) {
buf.write_byte(0xd9);
buf.write_byte(static_cast<uint8_t>(length));
} else if (length <= 65535) {
// str16: big-endian 16-bit length, written by indexing the raw bytes
// of the int — assumes a little-endian host.
buf.write_byte(0xda);
buf.write_byte(reinterpret_cast<const uint8_t*>(&length)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&length)[0]);
} else {
TraceEvent(SevWarn, "MsgpackSerializeString").detail("Failed to MessagePack encode very large string", length);
ASSERT_WE_THINK(false);
}
buf.write_bytes(c, length);
}
// Convenience overload: serializes a std::string's bytes as a msgpack string.
inline void serialize_string(const std::string& str, MsgpackBuffer& buf) {
serialize_string(reinterpret_cast<const uint8_t*>(str.data()), str.size(), buf);
}
// Serializes a std::vector as a msgpack array: writes the appropriate
// array header (fixarray / array16 / array32 depending on size), then
// serializes each element via the supplied functor f(element, buf).
// NOTE(review): the multi-byte length headers index the raw bytes of
// `size`, which assumes a little-endian host — confirm if this code ever
// targets big-endian hardware.
template <typename T, typename F>
inline void serialize_vector(const std::vector<T>& vec, MsgpackBuffer& buf, F f) {
size_t size = vec.size();
if (size <= 15) {
buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) {
buf.write_byte(0xdc);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else if (size <= std::numeric_limits<uint32_t>::max()) {
buf.write_byte(0xdd);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[3]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[2]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else {
TraceEvent(SevWarn, "MsgPackSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
// Use the provided serializer function to serialize the individual types of the vector
for (const auto& val : vec) {
f(val, buf);
}
}
// Serializes a map as a msgpack fixmap. Only maps with at most 15 entries
// are supported; larger maps trip an assert in simulation.
// Keys and values must expose begin()/size() over raw bytes (e.g. StringRef).
template <class Map>
inline void serialize_map(const Map& map, MsgpackBuffer& buf) {
int size = map.size();
if (size <= 15) {
// fixmap header: 1000xxxx where xxxx is the entry count.
buf.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "MsgPackSerializeMap").detail("Failed to MessagePack encode large map", size);
ASSERT_WE_THINK(false);
}
for (const auto& [key, value] : map) {
serialize_string(key.begin(), key.size(), buf);
serialize_string(value.begin(), value.size(), buf);
}
}
#endif

View File

@ -20,6 +20,7 @@
#ifndef FDBRPC_TIMED_REQUEST_H #ifndef FDBRPC_TIMED_REQUEST_H
#define FDBRPC_TIMED_REQUEST_H #define FDBRPC_TIMED_REQUEST_H
#include "flow/network.h"
#pragma once #pragma once
#include <fdbrpc/fdbrpc.h> #include <fdbrpc/fdbrpc.h>
@ -35,7 +36,7 @@ public:
TimedRequest() { TimedRequest() {
if (!FlowTransport::isClient()) { if (!FlowTransport::isClient()) {
_requestTime = timer(); _requestTime = g_network->timer();
} else { } else {
_requestTime = 0.0; _requestTime = 0.0;
} }

View File

@ -388,6 +388,8 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
Promise<Void> iAmReplaced; Promise<Void> iAmReplaced;
bool isFullRestoreMode = false;
BlobManagerData(UID id, BlobManagerData(UID id,
Reference<AsyncVar<ServerDBInfo> const> dbInfo, Reference<AsyncVar<ServerDBInfo> const> dbInfo,
Database db, Database db,
@ -3537,7 +3539,10 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
bmData->startRecruiting.trigger(); bmData->startRecruiting.trigger();
bmData->initBStore(); bmData->initBStore();
if (isFullRestoreMode()) {
bool isFullRestore = wait(isFullRestoreMode(bmData->db, normalKeys));
bmData->isFullRestoreMode = isFullRestore;
if (bmData->isFullRestoreMode) {
wait(loadManifest(bmData->db, bmData->bstore)); wait(loadManifest(bmData->db, bmData->bstore));
int64_t epoc = wait(lastBlobEpoc(bmData->db, bmData->bstore)); int64_t epoc = wait(lastBlobEpoc(bmData->db, bmData->bstore));
@ -5297,11 +5302,8 @@ ACTOR Future<Void> backupManifest(Reference<BlobManagerData> bmData) {
bmData->initBStore(); bmData->initBStore();
loop { loop {
bool pendingSplit = wait(hasPendingSplit(bmData)); wait(dumpManifest(bmData->db, bmData->bstore, bmData->epoch, bmData->manifestDumperSeqNo));
if (!pendingSplit) { bmData->manifestDumperSeqNo++;
wait(dumpManifest(bmData->db, bmData->bstore, bmData->epoch, bmData->manifestDumperSeqNo));
bmData->manifestDumperSeqNo++;
}
wait(delay(SERVER_KNOBS->BLOB_MANIFEST_BACKUP_INTERVAL)); wait(delay(SERVER_KNOBS->BLOB_MANIFEST_BACKUP_INTERVAL));
} }
} }
@ -5370,7 +5372,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
if (SERVER_KNOBS->BG_ENABLE_MERGING) { if (SERVER_KNOBS->BG_ENABLE_MERGING) {
self->addActor.send(granuleMergeChecker(self)); self->addActor.send(granuleMergeChecker(self));
} }
if (SERVER_KNOBS->BLOB_MANIFEST_BACKUP && !isFullRestoreMode()) { if (SERVER_KNOBS->BLOB_MANIFEST_BACKUP && !self->isFullRestoreMode) {
self->addActor.send(backupManifest(self)); self->addActor.send(backupManifest(self));
} }

View File

@ -60,7 +60,7 @@ struct BlobManifestFile {
int64_t seqNo{ 0 }; int64_t seqNo{ 0 };
BlobManifestFile(const std::string& path) { BlobManifestFile(const std::string& path) {
if (sscanf(path.c_str(), MANIFEST_FOLDER "/manifest.%" SCNd64 ".%" SCNd64, &epoch, &seqNo) == 2) { if (sscanf(path.c_str(), MANIFEST_FOLDER "/" MANIFEST_FOLDER ".%" SCNd64 ".%" SCNd64, &epoch, &seqNo) == 2) {
fileName = path; fileName = path;
} }
} }
@ -76,7 +76,7 @@ struct BlobManifestFile {
BlobManifestFile file(path); BlobManifestFile file(path);
return file.epoch > 0 && file.seqNo > 0; return file.epoch > 0 && file.seqNo > 0;
}; };
BackupContainerFileSystem::FilesAndSizesT filesAndSizes = wait(reader->listFiles(MANIFEST_FOLDER, filter)); BackupContainerFileSystem::FilesAndSizesT filesAndSizes = wait(reader->listFiles(MANIFEST_FOLDER "/", filter));
std::vector<BlobManifestFile> result; std::vector<BlobManifestFile> result;
for (auto& f : filesAndSizes) { for (auto& f : filesAndSizes) {
@ -107,6 +107,9 @@ public:
try { try {
state Standalone<BlobManifest> manifest; state Standalone<BlobManifest> manifest;
Standalone<VectorRef<KeyValueRef>> rows = wait(getSystemKeys(self)); Standalone<VectorRef<KeyValueRef>> rows = wait(getSystemKeys(self));
if (rows.size() == 0) {
return Void();
}
manifest.rows = rows; manifest.rows = rows;
Value data = encode(manifest); Value data = encode(manifest);
wait(writeToFile(self, data)); wait(writeToFile(self, data));
@ -153,7 +156,8 @@ private:
state std::string fullPath; state std::string fullPath;
std::tie(writer, fullPath) = self->blobConn_->createForWrite(MANIFEST_FOLDER); std::tie(writer, fullPath) = self->blobConn_->createForWrite(MANIFEST_FOLDER);
state std::string fileName = format(MANIFEST_FOLDER "/manifest.%lld.%lld", self->epoch_, self->seqNo_); state std::string fileName =
format(MANIFEST_FOLDER "/" MANIFEST_FOLDER ".%lld.%lld", self->epoch_, self->seqNo_);
state Reference<IBackupFile> file = wait(writer->writeFile(fileName)); state Reference<IBackupFile> file = wait(writer->writeFile(fileName));
wait(file->append(data.begin(), data.size())); wait(file->append(data.begin(), data.size()));
wait(file->finish()); wait(file->finish());
@ -453,3 +457,26 @@ ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider
int64_t epoc = wait(BlobManifestLoader::lastBlobEpoc(loader)); int64_t epoc = wait(BlobManifestLoader::lastBlobEpoc(loader));
return epoc; return epoc;
} }
// Return true if the given key range is restoring.
// Scans the blob restore command keyspace and reports whether `keys` is
// fully contained in a registered restore range whose progress is < 100.
// Retries on transaction errors via tr.onError.
ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef keys) {
state Transaction tr(db);
loop {
// System-keyspace read: needs immediate priority, system-key access,
// and lock awareness so it works during restore/locked states.
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
RangeResult ranges = wait(tr.getRange(blobRestoreCommandKeys, CLIENT_KNOBS->TOO_MANY));
for (auto& r : ranges) {
KeyRange keyRange = decodeBlobRestoreCommandKeyFor(r.key);
if (keyRange.contains(keys)) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(r.value);
return status.progress < 100; // progress is less than 100
}
}
// No restore range covers `keys` — not in full restore mode.
return false;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}

View File

@ -21,6 +21,7 @@
#include "flow/ActorCollection.h" #include "flow/ActorCollection.h"
#include "flow/FastRef.h" #include "flow/FastRef.h"
#include "flow/IRandom.h" #include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/flow.h" #include "flow/flow.h"
#include "fdbclient/StorageServerInterface.h" #include "fdbclient/StorageServerInterface.h"
#include "fdbclient/BlobConnectionProvider.h" #include "fdbclient/BlobConnectionProvider.h"
@ -63,14 +64,7 @@ public:
// Start migration // Start migration
ACTOR static Future<Void> start(Reference<BlobMigrator> self) { ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
if (!isFullRestoreMode()) { wait(checkIfReadyForMigration(self));
return Void();
}
wait(delay(10)); // TODO need to wait for a signal for readiness of blob manager
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
self->blobGranules_ = granules;
wait(prepare(self, normalKeys)); wait(prepare(self, normalKeys));
wait(advanceVersion(self)); wait(advanceVersion(self));
wait(serverLoop(self)); wait(serverLoop(self));
@ -78,6 +72,28 @@ public:
} }
private: private:
// Check if blob manifest is loaded so that blob migration can start.
// Polls every BLOB_MIGRATOR_CHECK_INTERVAL seconds until (a) a full
// restore is in progress for normalKeys and (b) the manifest lists at
// least one restorable granule. Caches the granule list on `self` and
// logs each granule before returning.
ACTOR static Future<Void> checkIfReadyForMigration(Reference<BlobMigrator> self) {
	loop {
		bool isFullRestore = wait(isFullRestoreMode(self->db_, normalKeys));
		if (isFullRestore) {
			BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
			if (!granules.empty()) {
				self->blobGranules_ = granules;
				// Iterate by const reference to avoid copying each
				// BlobGranuleRestoreVersion element per iteration.
				for (const BlobGranuleRestoreVersion& granule : granules) {
					TraceEvent("RestorableGranule")
					    .detail("GranuleId", granule.granuleID.toString())
					    .detail("KeyRange", granule.keyRange.toString())
					    .detail("Version", granule.version)
					    .detail("SizeInBytes", granule.sizeInBytes);
				}
				return Void();
			}
		}
		wait(delay(SERVER_KNOBS->BLOB_MIGRATOR_CHECK_INTERVAL));
	}
}
// Prepare for data migration for given key range. // Prepare for data migration for given key range.
ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) { ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
// Register as a storage server, so that DataDistributor could start data movement after // Register as a storage server, so that DataDistributor could start data movement after
@ -136,8 +152,9 @@ private:
} }
} }
if (owning) { if (owning) {
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse)); wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
TraceEvent("UnassignKeys").detail("Keys", keys.toString()).detail("From", id.toString());
} }
} }
wait(tr.commit()); wait(tr.commit());
@ -185,8 +202,10 @@ private:
// Calculated progress // Calculated progress
int64_t total = sizeInBytes(self); int64_t total = sizeInBytes(self);
int progress = (total - incompleted) * 100 / total; int progress = (total - incompleted) * 100 / total;
bool done = incompleted == 0; state bool done = incompleted == 0;
dprint("Progress {} :{}%. done {}\n", serverID.toString(), progress, done); dprint("Migration progress :{}%. done {}\n", progress, done);
TraceEvent("BlobMigratorProgress").detail("Progress", progress).detail("Done", done);
wait(updateProgress(self, normalKeys, progress));
return done; return done;
} catch (Error& e) { } catch (Error& e) {
wait(tr.onError(e)); wait(tr.onError(e));
@ -194,6 +213,32 @@ private:
} }
} }
// Update restore progress.
// Writes `progress` into the blob restore command entry for `range`, but
// only when the entry already exists and the new value is strictly larger
// than the stored one (progress is monotonically increasing). Retries on
// transaction errors via tr.onError.
ACTOR static Future<Void> updateProgress(Reference<BlobMigrator> self, KeyRangeRef range, int progress) {
state Transaction tr(self->db_);
loop {
try {
// System-keyspace write: immediate priority, system-key access,
// lock-aware so it works while the cluster is locked for restore.
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr.get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (progress > status.progress) {
status.progress = progress;
Value updatedValue = blobRestoreCommandValueFor(status);
tr.set(key, updatedValue);
wait(tr.commit());
}
}
// No entry for this range (or no advance) — nothing to commit.
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Advance version, so that future commits will have a larger version than the restored data // Advance version, so that future commits will have a larger version than the restored data
ACTOR static Future<Void> advanceVersion(Reference<BlobMigrator> self) { ACTOR static Future<Void> advanceVersion(Reference<BlobMigrator> self) {
state Transaction tr(self->db_); state Transaction tr(self->db_);
@ -207,6 +252,7 @@ private:
if (currentVersion <= expectedVersion) { if (currentVersion <= expectedVersion) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(expectedVersion + 1, Unversioned())); tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(expectedVersion + 1, Unversioned()));
dprint("Advance version from {} to {}\n", currentVersion, expectedVersion); dprint("Advance version from {} to {}\n", currentVersion, expectedVersion);
TraceEvent("AdvanceVersion").detail("Current", currentVersion).detail("New", expectedVersion);
wait(tr.commit()); wait(tr.commit());
} }
return Void(); return Void();
@ -218,7 +264,7 @@ private:
// Main server loop // Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) { ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture())); self->actors_.add(waitFailureServer(self->interf_.waitFailure.getFuture()));
self->actors_.add(logProgress(self)); self->actors_.add(logProgress(self));
self->actors_.add(handleRequest(self)); self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self)); self->actors_.add(handleUnsupportedRequest(self));
@ -226,6 +272,7 @@ private:
try { try {
choose { choose {
when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) { when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
dprint("Stopping blob migrator {}\n", self->interf_.id().toString());
req.reply.send(Void()); req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID); TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
break; break;
@ -237,6 +284,8 @@ private:
throw; throw;
} }
} }
self->actors_.clear(true);
dprint("Stopped blob migrator {}\n", self->interf_.id().toString());
return Void(); return Void();
} }
@ -267,7 +316,7 @@ private:
req.reply.send(rep); req.reply.send(rep);
} }
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) { when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
fmt::print("Handle GetStorageMetrics\n"); // fmt::print("Handle GetStorageMetrics\n");
StorageMetrics metrics; StorageMetrics metrics;
metrics.bytes = sizeInBytes(self); metrics.bytes = sizeInBytes(self);
GetStorageMetricsReply resp; GetStorageMetricsReply resp;
@ -331,7 +380,7 @@ private:
req.reply.sendError(unsupported_operation()); req.reply.sendError(unsupported_operation());
} }
when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) { when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
dprint("Unsupported UpdateCommitCostRequest\n"); // dprint("Unsupported UpdateCommitCostRequest\n");
req.reply.sendError(unsupported_operation()); req.reply.sendError(unsupported_operation());
} }
when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) { when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
@ -358,9 +407,9 @@ private:
} }
ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) { ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
dprint("Unsupported StorageQueuingMetricsRequest\n"); // dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME get rid of this delay. it's a temp solution to avoid starvaion scheduling of DD // FIXME get rid of this delay. it's a temp solution to avoid starvaion scheduling of DD
// processes // processes
wait(delay(1)); wait(delay(1));
req.reply.sendError(unsupported_operation()); req.reply.sendError(unsupported_operation());
return Void(); return Void();
@ -398,7 +447,8 @@ private:
// Main entry point // Main entry point
ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) { ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", interf.id().toString()); TraceEvent("StartBlobMigrator").detail("Interface", interf.id().toString());
dprint("Starting blob migrator {}\n", interf.id().toString());
try { try {
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf); Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
wait(BlobMigrator::start(self)); wait(BlobMigrator::start(self));

View File

@ -292,6 +292,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
int64_t lastResidentMemory = 0; int64_t lastResidentMemory = 0;
double lastResidentMemoryCheckTime = -100.0; double lastResidentMemoryCheckTime = -100.0;
bool isFullRestoreMode = false;
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db) BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db)
: id(id), db(db), tenantData(BGTenantMap(dbInfo)), dbInfo(dbInfo), : id(id), db(db), tenantData(BGTenantMap(dbInfo)), dbInfo(dbInfo),
initialSnapshotLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM)), initialSnapshotLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM)),
@ -2146,7 +2148,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
} }
// No need to start Change Feed in full restore mode // No need to start Change Feed in full restore mode
if (isFullRestoreMode()) if (bwData->isFullRestoreMode)
return Void(); return Void();
checkMergeCandidate = granuleCheckMergeCandidate(bwData, checkMergeCandidate = granuleCheckMergeCandidate(bwData,
@ -3588,7 +3590,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
state Reference<GranuleMetadata> metadata = m; state Reference<GranuleMetadata> metadata = m;
// state Version granuleBeginVersion = req.beginVersion; // state Version granuleBeginVersion = req.beginVersion;
// skip waiting for CF ready for recovery mode // skip waiting for CF ready for recovery mode
if (!isFullRestoreMode()) { if (!bwData->isFullRestoreMode) {
choose { choose {
when(wait(metadata->readable.getFuture())) {} when(wait(metadata->readable.getFuture())) {}
when(wait(metadata->cancelled.getFuture())) { throw wrong_shard_server(); } when(wait(metadata->cancelled.getFuture())) { throw wrong_shard_server(); }
@ -3646,7 +3648,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// this is an active granule query // this is an active granule query
loop { loop {
// skip check since CF doesn't start for bare metal recovery mode // skip check since CF doesn't start for bare metal recovery mode
if (isFullRestoreMode()) { if (bwData->isFullRestoreMode) {
break; break;
} }
if (!metadata->activeCFData.get().isValid() || !metadata->cancelled.canBeSet()) { if (!metadata->activeCFData.get().isValid() || !metadata->cancelled.canBeSet()) {
@ -3689,7 +3691,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// if feed was popped by another worker and BW only got empty versions, it wouldn't itself see that it // if feed was popped by another worker and BW only got empty versions, it wouldn't itself see that it
// got popped, but we can still reject the in theory this should never happen with other protections but // got popped, but we can still reject the in theory this should never happen with other protections but
// it's a useful and inexpensive sanity check // it's a useful and inexpensive sanity check
if (!isFullRestoreMode()) { if (!bwData->isFullRestoreMode) {
Version emptyVersion = metadata->activeCFData.get()->popVersion - 1; Version emptyVersion = metadata->activeCFData.get()->popVersion - 1;
if (req.readVersion > metadata->durableDeltaVersion.get() && if (req.readVersion > metadata->durableDeltaVersion.get() &&
emptyVersion > metadata->bufferedDeltaVersion) { emptyVersion > metadata->bufferedDeltaVersion) {
@ -3995,6 +3997,9 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
throw granule_assignment_conflict(); throw granule_assignment_conflict();
} }
bool isFullRestore = wait(isFullRestoreMode(bwData->db, req.keyRange));
bwData->isFullRestoreMode = isFullRestore;
Optional<Value> prevLockValue = wait(fLockValue); Optional<Value> prevLockValue = wait(fLockValue);
state bool hasPrevOwner = prevLockValue.present(); state bool hasPrevOwner = prevLockValue.present();
state bool createChangeFeed = false; state bool createChangeFeed = false;
@ -4069,7 +4074,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
} }
// for recovery mode - don't create change feed, don't create snapshot // for recovery mode - don't create change feed, don't create snapshot
if (isFullRestoreMode()) { if (bwData->isFullRestoreMode) {
createChangeFeed = false; createChangeFeed = false;
info.doSnapshot = false; info.doSnapshot = false;
GranuleFiles granuleFiles = wait(loadPreviousFiles(&tr, info.granuleID)); GranuleFiles granuleFiles = wait(loadPreviousFiles(&tr, info.granuleID));
@ -4091,7 +4096,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
} }
} }
if (createChangeFeed && !isFullRestoreMode()) { if (createChangeFeed && !bwData->isFullRestoreMode) {
// create new change feed for new version of granule // create new change feed for new version of granule
wait(updateChangeFeed( wait(updateChangeFeed(
&tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange)); &tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));
@ -4103,7 +4108,8 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// If anything in previousGranules, need to do the handoff logic and set // If anything in previousGranules, need to do the handoff logic and set
// ret.previousChangeFeedId, and the previous durable version will come from the previous // ret.previousChangeFeedId, and the previous durable version will come from the previous
// granules // granules
if (info.history.present() && info.history.get().value.parentVersions.size() > 0 && !isFullRestoreMode()) { if (info.history.present() && info.history.get().value.parentVersions.size() > 0 &&
!bwData->isFullRestoreMode) {
CODE_PROBE(true, "Granule open found parent"); CODE_PROBE(true, "Granule open found parent");
if (info.history.get().value.parentVersions.size() == 1) { // split if (info.history.get().value.parentVersions.size() == 1) { // split
state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0], state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0],

View File

@ -23,6 +23,7 @@
#include <map> #include <map>
#include <memory> #include <memory>
#include <set> #include <set>
#include <tuple>
#include <vector> #include <vector>
#include "fdbclient/FDBTypes.h" #include "fdbclient/FDBTypes.h"
@ -691,7 +692,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
WorkerDetails newMGWorker; WorkerDetails newMGWorker;
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
newBMWorker = findNewProcessForSingleton(self, ProcessClass::BlobManager, id_used); newBMWorker = findNewProcessForSingleton(self, ProcessClass::BlobManager, id_used);
if (isFullRestoreMode()) { if (self->db.blobRestoreEnabled.get()) {
newMGWorker = findNewProcessForSingleton(self, ProcessClass::BlobMigrator, id_used); newMGWorker = findNewProcessForSingleton(self, ProcessClass::BlobMigrator, id_used);
} }
} }
@ -710,7 +711,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
ProcessClass::Fitness bestFitnessForMG; ProcessClass::Fitness bestFitnessForMG;
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
bestFitnessForBM = findBestFitnessForSingleton(self, newBMWorker, ProcessClass::BlobManager); bestFitnessForBM = findBestFitnessForSingleton(self, newBMWorker, ProcessClass::BlobManager);
if (isFullRestoreMode()) { if (self->db.blobRestoreEnabled.get()) {
bestFitnessForMG = findBestFitnessForSingleton(self, newMGWorker, ProcessClass::BlobManager); bestFitnessForMG = findBestFitnessForSingleton(self, newMGWorker, ProcessClass::BlobManager);
} }
} }
@ -744,7 +745,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
bmHealthy = isHealthySingleton<BlobManagerInterface>( bmHealthy = isHealthySingleton<BlobManagerInterface>(
self, newBMWorker, bmSingleton, bestFitnessForBM, self->recruitingBlobManagerID); self, newBMWorker, bmSingleton, bestFitnessForBM, self->recruitingBlobManagerID);
if (isFullRestoreMode()) { if (self->db.blobRestoreEnabled.get()) {
mgHealthy = isHealthySingleton<BlobMigratorInterface>( mgHealthy = isHealthySingleton<BlobMigratorInterface>(
self, newMGWorker, mgSingleton, bestFitnessForMG, self->recruitingBlobMigratorID); self, newMGWorker, mgSingleton, bestFitnessForMG, self->recruitingBlobMigratorID);
} }
@ -775,7 +776,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
currBMProcessId = bmSingleton.interface.get().locality.processId(); currBMProcessId = bmSingleton.interface.get().locality.processId();
newBMProcessId = newBMWorker.interf.locality.processId(); newBMProcessId = newBMWorker.interf.locality.processId();
if (isFullRestoreMode()) { if (self->db.blobRestoreEnabled.get()) {
currMGProcessId = mgSingleton.interface.get().locality.processId(); currMGProcessId = mgSingleton.interface.get().locality.processId();
newMGProcessId = newMGWorker.interf.locality.processId(); newMGProcessId = newMGWorker.interf.locality.processId();
} }
@ -792,7 +793,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) { if (self->db.blobGranulesEnabled.get()) {
currPids.emplace_back(currBMProcessId); currPids.emplace_back(currBMProcessId);
newPids.emplace_back(newBMProcessId); newPids.emplace_back(newBMProcessId);
if (isFullRestoreMode()) { if (self->db.blobRestoreEnabled.get()) {
currPids.emplace_back(currMGProcessId); currPids.emplace_back(currMGProcessId);
newPids.emplace_back(newMGProcessId); newPids.emplace_back(newMGProcessId);
} }
@ -810,7 +811,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (!self->db.blobGranulesEnabled.get()) { if (!self->db.blobGranulesEnabled.get()) {
ASSERT(currColocMap[currBMProcessId] == 0); ASSERT(currColocMap[currBMProcessId] == 0);
ASSERT(newColocMap[newBMProcessId] == 0); ASSERT(newColocMap[newBMProcessId] == 0);
if (isFullRestoreMode()) { if (self->db.blobRestoreEnabled.get()) {
ASSERT(currColocMap[currMGProcessId] == 0); ASSERT(currColocMap[currMGProcessId] == 0);
ASSERT(newColocMap[newMGProcessId] == 0); ASSERT(newColocMap[newMGProcessId] == 0);
} }
@ -836,7 +837,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
ddSingleton.recruit(self); ddSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && newColocMap[newBMProcessId] < currColocMap[currBMProcessId]) { } else if (self->db.blobGranulesEnabled.get() && newColocMap[newBMProcessId] < currColocMap[currBMProcessId]) {
bmSingleton.recruit(self); bmSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() && } else if (self->db.blobGranulesEnabled.get() && self->db.blobRestoreEnabled.get() &&
newColocMap[newMGProcessId] < currColocMap[currMGProcessId]) { newColocMap[newMGProcessId] < currColocMap[currMGProcessId]) {
mgSingleton.recruit(self); mgSingleton.recruit(self);
} else if (SERVER_KNOBS->ENABLE_ENCRYPTION && newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) { } else if (SERVER_KNOBS->ENABLE_ENCRYPTION && newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) {
@ -1404,13 +1405,13 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
self, w, currSingleton, registeringSingleton, self->recruitingRatekeeperID); self, w, currSingleton, registeringSingleton, self->recruitingRatekeeperID);
} }
if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() && req.blobManagerInterf.present()) { if (self->db.blobGranulesEnabled.get() && req.blobManagerInterf.present()) {
auto currSingleton = BlobManagerSingleton(self->db.serverInfo->get().blobManager); auto currSingleton = BlobManagerSingleton(self->db.serverInfo->get().blobManager);
auto registeringSingleton = BlobManagerSingleton(req.blobManagerInterf); auto registeringSingleton = BlobManagerSingleton(req.blobManagerInterf);
haltRegisteringOrCurrentSingleton<BlobManagerInterface>( haltRegisteringOrCurrentSingleton<BlobManagerInterface>(
self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID); self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID);
} }
if (req.blobMigratorInterf.present()) { if (req.blobMigratorInterf.present() && self->db.blobRestoreEnabled.get()) {
auto currSingleton = BlobMigratorSingleton(self->db.serverInfo->get().blobMigrator); auto currSingleton = BlobMigratorSingleton(self->db.serverInfo->get().blobMigrator);
auto registeringSingleton = BlobMigratorSingleton(req.blobMigratorInterf); auto registeringSingleton = BlobMigratorSingleton(req.blobMigratorInterf);
haltRegisteringOrCurrentSingleton<BlobMigratorInterface>( haltRegisteringOrCurrentSingleton<BlobMigratorInterface>(
@ -2553,6 +2554,43 @@ ACTOR Future<int64_t> getNextBMEpoch(ClusterControllerData* self) {
} }
} }
ACTOR Future<Void> watchBlobRestoreCommand(ClusterControllerData* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
state Key blobRestoreCommandKey = blobRestoreCommandKeyFor(normalKeys);
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> blobRestoreCommand = wait(tr->get(blobRestoreCommandKey));
if (blobRestoreCommand.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(blobRestoreCommand.get());
TraceEvent("WatchBlobRestoreCommand").detail("Progress", status.progress);
if (status.progress == 0) {
self->db.blobRestoreEnabled.set(true);
if (self->db.blobGranulesEnabled.get()) {
const auto& blobManager = self->db.serverInfo->get().blobManager;
if (blobManager.present()) {
BlobManagerSingleton(blobManager)
.haltBlobGranules(self, blobManager.get().locality.processId());
}
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
if (blobMigrator.present()) {
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
}
}
}
}
state Future<Void> watch = tr->watch(blobRestoreCommandKey);
wait(tr->commit());
wait(watch);
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
ACTOR Future<Void> startBlobMigrator(ClusterControllerData* self, double waitTime) { ACTOR Future<Void> startBlobMigrator(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID. // If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window. // Also wait to avoid too many consecutive recruits in a small time window.
@ -2629,9 +2667,8 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
} }
loop { loop {
if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) { if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
state Future<Void> wfClient = state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure, SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
loop { loop {
choose { choose {
when(wait(wfClient)) { when(wait(wfClient)) {
@ -2643,11 +2680,11 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
when(wait(self->recruitBlobMigrator.onChange())) {} when(wait(self->recruitBlobMigrator.onChange())) {}
} }
} }
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode()) { } else if (self->db.blobGranulesEnabled.get() && self->db.blobRestoreEnabled.get()) {
// if there is no blob migrator present but blob granules are now enabled, recruit a BM // if there is no blob migrator present but blob granules are now enabled, recruit a BM
wait(startBlobMigrator(self, recruitThrottler.newRecruitment())); wait(startBlobMigrator(self, recruitThrottler.newRecruitment()));
} else { } else {
wait(self->db.blobGranulesEnabled.onChange()); wait(self->db.blobGranulesEnabled.onChange() || self->db.blobRestoreEnabled.onChange());
} }
} }
} }
@ -2778,7 +2815,7 @@ ACTOR Future<Void> monitorBlobManager(ClusterControllerData* self) {
const auto& blobManager = self->db.serverInfo->get().blobManager; const auto& blobManager = self->db.serverInfo->get().blobManager;
BlobManagerSingleton(blobManager) BlobManagerSingleton(blobManager)
.haltBlobGranules(self, blobManager.get().locality.processId()); .haltBlobGranules(self, blobManager.get().locality.processId());
if (isFullRestoreMode()) { if (self->db.blobRestoreEnabled.get()) {
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator; const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId()); BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
} }
@ -3079,8 +3116,9 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(monitorDataDistributor(&self)); self.addActor.send(monitorDataDistributor(&self));
self.addActor.send(monitorRatekeeper(&self)); self.addActor.send(monitorRatekeeper(&self));
self.addActor.send(monitorBlobManager(&self)); self.addActor.send(monitorBlobManager(&self));
self.addActor.send(monitorBlobMigrator(&self));
self.addActor.send(watchBlobGranulesConfigKey(&self)); self.addActor.send(watchBlobGranulesConfigKey(&self));
self.addActor.send(monitorBlobMigrator(&self));
self.addActor.send(watchBlobRestoreCommand(&self));
self.addActor.send(monitorConsistencyScan(&self)); self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self)); self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self)); self.addActor.send(dbInfoUpdater(&self));

View File

@ -414,7 +414,8 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData* commitData,
} }
Optional<TenantNameRef> const& tenantName = req.tenantInfo.name; Optional<TenantNameRef> const& tenantName = req.tenantInfo.name;
if (tenantName.present() && commitData->tenantsOverStorageQuota.count(tenantName.get()) > 0) { if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED && tenantName.present() &&
commitData->tenantsOverStorageQuota.count(tenantName.get()) > 0) {
req.reply.sendError(storage_quota_exceeded()); req.reply.sendError(storage_quota_exceeded());
continue; continue;
} }
@ -829,7 +830,7 @@ ACTOR Future<Void> preresolutionProcessing(CommitBatchContext* self) {
SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) { SERVER_KNOBS->PROXY_REJECT_BATCH_QUEUED_TOO_LONG && canReject(trs)) {
// Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries. // Disabled for the recovery transaction. otherwise, recovery can't finish and keeps doing more recoveries.
CODE_PROBE(true, "Reject transactions in the batch"); CODE_PROBE(true, "Reject transactions in the batch");
TraceEvent(SevWarnAlways, "ProxyReject", pProxyCommitData->dbgid) TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyReject", pProxyCommitData->dbgid)
.suppressFor(0.1) .suppressFor(0.1)
.detail("QDelay", queuingDelay) .detail("QDelay", queuingDelay)
.detail("Transactions", trs.size()) .detail("Transactions", trs.size())
@ -2971,7 +2972,9 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
proxy.expireIdempotencyId, proxy.expireIdempotencyId,
commitData.expectedIdempotencyIdCountForKey, commitData.expectedIdempotencyIdCountForKey,
&commitData.idempotencyClears)); &commitData.idempotencyClears));
addActor.send(monitorTenantsOverStorageQuota(proxy.id(), db, &commitData)); if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
addActor.send(monitorTenantsOverStorageQuota(proxy.id(), db, &commitData));
}
// wait for txnStateStore recovery // wait for txnStateStore recovery
wait(success(commitData.txnStateStore->readValue(StringRef()))); wait(success(commitData.txnStateStore->readValue(StringRef())));

View File

@ -1423,6 +1423,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
state double startTime = now(); state double startTime = now();
state std::vector<UID> destIds; state std::vector<UID> destIds;
state uint64_t debugID = deterministicRandom()->randomUInt64(); state uint64_t debugID = deterministicRandom()->randomUInt64();
state bool enableShardMove = SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD;
try { try {
if (now() - self->lastInterval < 1.0) { if (now() - self->lastInterval < 1.0) {
@ -1539,8 +1540,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
req.src = rd.src; req.src = rd.src;
req.completeSources = rd.completeSources; req.completeSources = rd.completeSources;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD && if (enableShardMove && tciIndex == 1) {
tciIndex == 1) {
ASSERT(physicalShardIDCandidate != UID().first() && ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first()); physicalShardIDCandidate != anonymousShardId.first());
Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard = Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
@ -1587,64 +1587,58 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
anyWithSource = true; anyWithSource = true;
} }
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { if (enableShardMove) {
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary
// team Thus, tryGetAvailableRemoteTeamWith() may select an almost full remote team In this
// case, we must re-select a remote team We set foundTeams = false to avoid finishing team
// selection Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select
// a remote team
if (tciIndex == 1 && !forceToUseNewPhysicalShard) { if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary
// team Thus, tryGetAvailableRemoteTeamWith() may select an almost full remote team In
// this case, we must re-select a remote team We set foundTeams = false to avoid
// finishing team selection Then, forceToUseNewPhysicalShard is set, which enforce to
// use getTeam to select a remote team
bool minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true); bool minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) { if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull; retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
foundTeams = false; foundTeams = false;
break; break;
} }
}
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { // critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary
// team Thus, tryGetAvailableRemoteTeamWith() may select an unhealthy remote team In
// this case, we must re-select a remote team We set foundTeams = false to avoid
// finishing team selection Then, forceToUseNewPhysicalShard is set, which enforce to
// use getTeam to select a remote team
if (!bestTeam.first.get()->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
break;
}
}
bestTeams.emplace_back(bestTeam.first.get(), true); bestTeams.emplace_back(bestTeam.first.get(), true);
// Always set bestTeams[i].second = true to disable optimization in data move between DCs // Always set bestTeams[i].second = true to disable optimization in data move between DCs
// for the correctness of PhysicalShardCollection // for the correctness of PhysicalShardCollection
// Currently, enabling the optimization will break the invariant of PhysicalShardCollection // Currently, enabling the optimization will break the invariant of PhysicalShardCollection
// Invariant: once a physical shard is created with a specific set of SSes, this SS set will // Invariant: once a physical shard is created with a specific set of SSes, this SS set will
// never get changed. // never get changed.
if (tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
} else { } else {
bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second); bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
} }
// get physicalShardIDCandidate
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
} }
tciIndex++; tciIndex++;
} }
// critical to the correctness of team selection by PhysicalShardCollection
// tryGetAvailableRemoteTeamWith() enforce to select a remote team paired with a primary team
// Thus, tryGetAvailableRemoteTeamWith() may select an unhealthy remote team
// In this case, we must re-select a remote team
// We set foundTeams = false to avoid finishing team selection
// Then, forceToUseNewPhysicalShard is set, which enforce to use getTeam to select a remote team
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
}
}
// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves // once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
// already // already
anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap); anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);
@ -1665,7 +1659,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("AnyDestOverloaded", anyDestOverloaded) .detail("AnyDestOverloaded", anyDestOverloaded)
.detail("NumOfTeamCollections", self->teamCollections.size()) .detail("NumOfTeamCollections", self->teamCollections.size())
.detail("Servers", destServersString(bestTeams)); .detail("Servers", destServersString(bestTeams));
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { if (enableShardMove) {
if (rd.isRestore() && destOverloadedCount > 50) { if (rd.isRestore() && destOverloadedCount > 50) {
throw data_move_dest_team_not_found(); throw data_move_dest_team_not_found();
} }
@ -1689,14 +1683,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// When forceToUseNewPhysicalShard = false, we get paired primary team and remote team // When forceToUseNewPhysicalShard = false, we get paired primary team and remote team
// However, this may be failed // However, this may be failed
// Any retry triggers to use new physicalShard which enters the normal routine // Any retry triggers to use new physicalShard which enters the normal routine
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { if (enableShardMove) {
forceToUseNewPhysicalShard = true; forceToUseNewPhysicalShard = true;
} }
// TODO different trace event + knob for overloaded? Could wait on an async var for done moves // TODO different trace event + knob for overloaded? Could wait on an async var for done moves
} }
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { if (enableShardMove) {
if (!rd.isRestore()) { if (!rd.isRestore()) {
// when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate // when !rd.isRestore(), dataMoveId is just decided as physicalShardIDCandidate
// thus, update the physicalShardIDCandidate to related data structures // thus, update the physicalShardIDCandidate to related data structures
@ -1954,7 +1948,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
self->shardsAffectedByTeamFailure->finishMove(rd.keys); self->shardsAffectedByTeamFailure->finishMove(rd.keys);
relocationComplete.send(rd); relocationComplete.send(rd);
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) { if (enableShardMove) {
// update physical shard collection // update physical shard collection
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams; std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams;
for (int i = 0; i < bestTeams.size(); i++) { for (int i = 0; i < bestTeams.size(); i++) {

View File

@ -588,7 +588,6 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<DDTeamCollection> primaryTeamCollection; state Reference<DDTeamCollection> primaryTeamCollection;
state Reference<DDTeamCollection> remoteTeamCollection; state Reference<DDTeamCollection> remoteTeamCollection;
state bool trackerCancelled; state bool trackerCancelled;
state bool ddIsTenantAware = SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED;
loop { loop {
trackerCancelled = false; trackerCancelled = false;
self->initialized = Promise<Void>(); self->initialized = Promise<Void>();
@ -610,7 +609,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false)); state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false));
state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false)); state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false));
if (ddIsTenantAware) { if (SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED || SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
self->ddTenantCache = makeReference<TenantCache>(cx, self->ddId); self->ddTenantCache = makeReference<TenantCache>(cx, self->ddId);
wait(self->ddTenantCache.get()->build()); wait(self->ddTenantCache.get()->build());
} }
@ -684,6 +683,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
"DDTenantCacheMonitor", "DDTenantCacheMonitor",
self->ddId, self->ddId,
&normalDDQueueErrors())); &normalDDQueueErrors()));
}
if (self->ddTenantCache.present() && SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
actors.push_back(reportErrorsExcept(self->ddTenantCache.get()->monitorStorageQuota(), actors.push_back(reportErrorsExcept(self->ddTenantCache.get()->monitorStorageQuota(),
"StorageQuotaTracker", "StorageQuotaTracker",
self->ddId, self->ddId,
@ -1320,7 +1321,7 @@ GetStorageWigglerStateReply getStorageWigglerStates(Reference<DataDistributor> s
TenantsOverStorageQuotaReply getTenantsOverStorageQuota(Reference<DataDistributor> self) { TenantsOverStorageQuotaReply getTenantsOverStorageQuota(Reference<DataDistributor> self) {
TenantsOverStorageQuotaReply reply; TenantsOverStorageQuotaReply reply;
if (self->ddTenantCache.present()) { if (self->ddTenantCache.present() && SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
reply.tenants = self->ddTenantCache.get()->getTenantsOverQuota(); reply.tenants = self->ddTenantCache.get()->getTenantsOverQuota();
} }
return reply; return reply;

View File

@ -446,11 +446,14 @@ void proxyGRVThresholdExceeded(const GetReadVersionRequest* req, GrvProxyStats*
++stats->txnRequestErrors; ++stats->txnRequestErrors;
req->reply.sendError(grv_proxy_memory_limit_exceeded()); req->reply.sendError(grv_proxy_memory_limit_exceeded());
if (req->priority == TransactionPriority::IMMEDIATE) { if (req->priority == TransactionPriority::IMMEDIATE) {
TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceededSystem").suppressFor(60); TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyGRVThresholdExceededSystem")
.suppressFor(60);
} else if (req->priority == TransactionPriority::DEFAULT) { } else if (req->priority == TransactionPriority::DEFAULT) {
TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceededDefault").suppressFor(60); TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyGRVThresholdExceededDefault")
.suppressFor(60);
} else { } else {
TraceEvent(SevWarnAlways, "ProxyGRVThresholdExceededBatch").suppressFor(60); TraceEvent(g_network->isSimulated() ? SevInfo : SevWarnAlways, "ProxyGRVThresholdExceededBatch")
.suppressFor(60);
} }
} }

View File

@ -58,6 +58,14 @@ void GrvProxyTagThrottler::TagQueue::rejectRequests(LatencyBandsMap& latencyBand
} }
} }
void GrvProxyTagThrottler::TagQueue::endReleaseWindow(int64_t numStarted, double elapsed) {
if (rateInfo.present()) {
CODE_PROBE(requests.empty(), "Tag queue ending release window with empty request queue");
CODE_PROBE(!requests.empty(), "Tag queue ending release window with requests still queued");
rateInfo.get().endReleaseWindow(numStarted, requests.empty(), elapsed);
}
}
GrvProxyTagThrottler::GrvProxyTagThrottler(double maxThrottleDuration) GrvProxyTagThrottler::GrvProxyTagThrottler(double maxThrottleDuration)
: maxThrottleDuration(maxThrottleDuration), : maxThrottleDuration(maxThrottleDuration),
latencyBandsMap("GrvProxyTagThrottler", latencyBandsMap("GrvProxyTagThrottler",
@ -202,16 +210,14 @@ void GrvProxyTagThrottler::releaseTransactions(double elapsed,
} }
} }
// End release windows for queues with valid rateInfo // End release windows for all tag queues
{ {
TransactionTagMap<uint32_t> transactionsReleasedMap; TransactionTagMap<uint32_t> transactionsReleasedMap;
for (const auto& [tag, count] : transactionsReleased) { for (const auto& [tag, count] : transactionsReleased) {
transactionsReleasedMap[tag] = count; transactionsReleasedMap[tag] = count;
} }
for (auto& [tag, queue] : queues) { for (auto& [tag, queue] : queues) {
if (queue.rateInfo.present()) { queue.endReleaseWindow(transactionsReleasedMap[tag], elapsed);
queue.rateInfo.get().endReleaseWindow(transactionsReleasedMap[tag], false, elapsed);
}
} }
} }
// If the capacity is increased, that means the vector has been illegally resized, potentially // If the capacity is increased, that means the vector has been illegally resized, potentially
@ -438,3 +444,33 @@ TEST_CASE("/GrvProxyTagThrottler/Fifo") {
wait(mockFifoClient(&throttler)); wait(mockFifoClient(&throttler));
return Void(); return Void();
} }
// Tests that while throughput is low, the tag throttler
// does not accumulate too much budget.
//
// A server is setup to server 10 transactions per second,
// then runs idly for 60 seconds. Then a client starts
// and attempts 20 transactions per second for 60 seconds.
// The server throttles the client to only achieve
// 10 transactions per second during this 60 second window.
// If the throttler is allowed to accumulate budget indefinitely
// during the idle 60 seconds, this test will fail.
TEST_CASE("/GrvProxyTagThrottler/LimitedIdleBudget") {
state GrvProxyTagThrottler throttler(5.0);
state TagSet tagSet;
state TransactionTagMap<uint32_t> counters;
{
TransactionTagMap<double> rates;
rates["sampleTag"_sr] = 10.0;
throttler.updateRates(rates);
}
tagSet.addTag("sampleTag"_sr);
state Future<Void> server = mockServer(&throttler);
wait(delay(60.0));
state Future<Void> client = mockClient(&throttler, TransactionPriority::DEFAULT, tagSet, 1, 20.0, &counters);
wait(timeout(client && server, 60.0, Void()));
TraceEvent("TagQuotaTest_LimitedIdleBudget").detail("Counter", counters["sampleTag"_sr]);
ASSERT(isNear(counters["sampleTag"_sr], 60.0 * 10.0));
return Void();
}

View File

@ -35,7 +35,7 @@ bool GrvTransactionRateInfo::canStart(int64_t numAlreadyStarted, int64_t count)
std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START); std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
} }
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed) { void GrvTransactionRateInfo::endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed) {
// Update the budget to accumulate any extra capacity available or remove any excess that was used. // Update the budget to accumulate any extra capacity available or remove any excess that was used.
// The actual delta is the portion of the limit we didn't use multiplied by the fraction of the rate window that // The actual delta is the portion of the limit we didn't use multiplied by the fraction of the rate window that
// elapsed. // elapsed.
@ -52,16 +52,15 @@ void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool
// //
// Note that "rate window" here indicates a period of SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW seconds, // Note that "rate window" here indicates a period of SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW seconds,
// whereas "release window" is the period between wait statements, with duration indicated by "elapsed." // whereas "release window" is the period between wait statements, with duration indicated by "elapsed."
budget = budget = std::max(0.0, budget + elapsed * (limit - numStarted) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
std::max(0.0, budget + elapsed * (limit - numStartedAtPriority) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
// If we are emptying out the queue of requests, then we don't need to carry much budget forward // If we are emptying out the queue of requests, then we don't need to carry much budget forward
// If we did keep accumulating budget, then our responsiveness to changes in workflow could be compromised // If we did keep accumulating budget, then our responsiveness to changes in workflow could be compromised
if (queueEmptyAtPriority) { if (queueEmpty) {
budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET); budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET);
} }
smoothReleased.addDelta(numStartedAtPriority); smoothReleased.addDelta(numStarted);
} }
void GrvTransactionRateInfo::disable() { void GrvTransactionRateInfo::disable() {

View File

@ -391,9 +391,16 @@ struct Counters {
CounterCollection cc; CounterCollection cc;
Counter immediateThrottle; Counter immediateThrottle;
Counter failedToAcquire; Counter failedToAcquire;
Counter deleteKeyReqs;
Counter deleteRangeReqs;
Counter convertedDeleteKeyReqs;
Counter convertedDeleteRangeReqs;
Counters() Counters()
: cc("RocksDBThrottle"), immediateThrottle("ImmediateThrottle", cc), failedToAcquire("FailedToAcquire", cc) {} : cc("RocksDBThrottle"), immediateThrottle("ImmediateThrottle", cc), failedToAcquire("FailedToAcquire", cc),
deleteKeyReqs("DeleteKeyRequests", cc), deleteRangeReqs("DeleteRangeRequests", cc),
convertedDeleteKeyReqs("ConvertedDeleteKeyRequests", cc),
convertedDeleteRangeReqs("ConvertedDeleteRangeRequests", cc) {}
}; };
struct ReadIterator { struct ReadIterator {
@ -1934,12 +1941,17 @@ struct RocksDBKeyValueStore : IKeyValueStore {
} }
ASSERT(defaultFdbCF != nullptr); ASSERT(defaultFdbCF != nullptr);
// Number of deletes to rocksdb = counters.deleteKeyReqs + convertedDeleteKeyReqs;
// Number of deleteRanges to rocksdb = counters.deleteRangeReqs - counters.convertedDeleteRangeReqs;
if (keyRange.singleKeyRange()) { if (keyRange.singleKeyRange()) {
writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin)); writeBatch->Delete(defaultFdbCF, toSlice(keyRange.begin));
++counters.deleteKeyReqs;
} else { } else {
++counters.deleteRangeReqs;
if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr && if (SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_ON_CLEARRANGE && storageMetrics != nullptr &&
storageMetrics->byteSample.getEstimate(keyRange) < storageMetrics->byteSample.getEstimate(keyRange) <
SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) { SERVER_KNOBS->ROCKSDB_SINGLEKEY_DELETES_BYTES_LIMIT) {
++counters.convertedDeleteRangeReqs;
rocksdb::ReadOptions options = sharedState->getReadOptions(); rocksdb::ReadOptions options = sharedState->getReadOptions();
auto beginSlice = toSlice(keyRange.begin); auto beginSlice = toSlice(keyRange.begin);
auto endSlice = toSlice(keyRange.end); auto endSlice = toSlice(keyRange.end);
@ -1949,6 +1961,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
cursor->Seek(toSlice(keyRange.begin)); cursor->Seek(toSlice(keyRange.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) { while (cursor->Valid() && toStringRef(cursor->key()) < keyRange.end) {
writeBatch->Delete(defaultFdbCF, cursor->key()); writeBatch->Delete(defaultFdbCF, cursor->key());
++counters.convertedDeleteKeyReqs;
cursor->Next(); cursor->Next();
} }
if (!cursor->status().ok()) { if (!cursor->status().ok()) {
@ -1958,6 +1971,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
auto it = keysSet.lower_bound(keyRange.begin); auto it = keysSet.lower_bound(keyRange.begin);
while (it != keysSet.end() && *it < keyRange.end) { while (it != keysSet.end() && *it < keyRange.end) {
writeBatch->Delete(defaultFdbCF, toSlice(*it)); writeBatch->Delete(defaultFdbCF, toSlice(*it));
++counters.convertedDeleteKeyReqs;
it++; it++;
} }
} }

View File

@ -289,11 +289,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
// Detect conflicts // Detect conflicts
double expire = now() + SERVER_KNOBS->SAMPLE_EXPIRATION_TIME; double expire = now() + SERVER_KNOBS->SAMPLE_EXPIRATION_TIME;
ConflictBatch conflictBatch(self->conflictSet, &reply.conflictingKeyRangeMap, &reply.arena); ConflictBatch conflictBatch(self->conflictSet, &reply.conflictingKeyRangeMap, &reply.arena);
Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS; const Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
newOldestVersion = req.version - std::max(5 * SERVER_KNOBS->VERSIONS_PER_SECOND,
SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS);
}
for (int t = 0; t < req.transactions.size(); t++) { for (int t = 0; t < req.transactions.size(); t++) {
conflictBatch.addTransaction(req.transactions[t], newOldestVersion); conflictBatch.addTransaction(req.transactions[t], newOldestVersion);
self->resolvedReadConflictRanges += req.transactions[t].read_conflict_ranges.size(); self->resolvedReadConflictRanges += req.transactions[t].read_conflict_ranges.size();

View File

@ -422,11 +422,12 @@ ACTOR static Future<Void> _parsePartitionedLogFileOnLoader(
state LogMessageVersion msgVersion; state LogMessageVersion msgVersion;
msgVersion.version = reader.consumeNetworkUInt64(); msgVersion.version = reader.consumeNetworkUInt64();
msgVersion.sub = reader.consumeNetworkUInt32(); msgVersion.sub = reader.consumeNetworkUInt32();
int msgSize = reader.consumeNetworkInt32(); state int msgSize = reader.consumeNetworkInt32();
const uint8_t* message = reader.consume(msgSize); state const uint8_t* message = reader.consume(msgSize);
// Skip mutations out of the version range // Skip mutations out of the version range
if (!asset.isInVersionRange(msgVersion.version)) { if (!asset.isInVersionRange(msgVersion.version)) {
wait(yield()); // avoid potential stack overflows
continue; continue;
} }

View File

@ -127,25 +127,38 @@ public:
loop { loop {
state double fetchStartTime = now(); state double fetchStartTime = now();
state std::vector<TenantName> tenants = tenantCache->getTenantList(); state std::vector<TenantGroupName> groups;
for (const auto& [group, storage] : tenantCache->tenantStorageMap) {
groups.push_back(group);
}
state int i; state int i;
for (i = 0; i < tenants.size(); i++) { for (i = 0; i < groups.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]); state TenantGroupName group = groups[i];
loop { state int64_t usage = 0;
try { // `tenants` needs to be a copy so that the erase (below) or inserts/erases from other
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys)); // functions (when this actor yields) do not interfere with the iteration
tenantCache->tenantStorageMap[tenants[i]].usage = size; state std::unordered_set<TenantName> tenants = tenantCache->tenantStorageMap[group].tenants;
break; state std::unordered_set<TenantName>::iterator iter = tenants.begin();
} catch (Error& e) { for (; iter != tenants.end(); iter++) {
if (e.code() == error_code_tenant_not_found) { state TenantName tenant = *iter;
tenantCache->tenantStorageMap.erase(tenants[i]); state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenant);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
usage += size;
break; break;
} else { } catch (Error& e) {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e); if (e.code() == error_code_tenant_not_found) {
wait(tr.onError(e)); tenantCache->tenantStorageMap[group].tenants.erase(tenant);
break;
} else {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
} }
} }
} }
tenantCache->tenantStorageMap[group].usage = usage;
} }
lastTenantListFetchTime = now(); lastTenantListFetchTime = now();
@ -162,22 +175,24 @@ public:
state Transaction tr(tenantCache->dbcx()); state Transaction tr(tenantCache->dbcx());
loop { loop {
loop { try {
try { state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY)); // Reset the quota for all groups; this essentially sets the quota to `max` for groups where the
for (auto const kv : currentQuotas) { // quota might have been cleared (i.e., groups that will not be returned in `getRange` request above).
TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix); for (auto& [group, storage] : tenantCache->tenantStorageMap) {
int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned()); storage.quota = std::numeric_limits<int64_t>::max();
tenantCache->tenantStorageMap[tenant].quota = quota;
}
tr.reset();
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
} }
for (const auto kv : currentQuotas) {
const TenantGroupName group = kv.key.removePrefix(storageQuotaPrefix);
const int64_t quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[group].quota = quota;
}
tr.reset();
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
} }
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
} }
} }
}; };
@ -189,6 +204,10 @@ void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) {
TenantInfo tenantInfo(tenantName, Optional<Standalone<StringRef>>(), tenant.id); TenantInfo tenantInfo(tenantName, Optional<Standalone<StringRef>>(), tenant.id);
tenantCache[tenantPrefix] = makeReference<TCTenantInfo>(tenantInfo, tenant.prefix); tenantCache[tenantPrefix] = makeReference<TCTenantInfo>(tenantInfo, tenant.prefix);
tenantCache[tenantPrefix]->updateCacheGeneration(generation); tenantCache[tenantPrefix]->updateCacheGeneration(generation);
if (tenant.tenantGroup.present()) {
tenantStorageMap[tenant.tenantGroup.get()].tenants.insert(tenantName);
}
} }
void TenantCache::startRefresh() { void TenantCache::startRefresh() {
@ -289,13 +308,13 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const {
} }
std::unordered_set<TenantName> TenantCache::getTenantsOverQuota() const { std::unordered_set<TenantName> TenantCache::getTenantsOverQuota() const {
std::unordered_set<TenantName> tenants; std::unordered_set<TenantName> tenantsOverQuota;
for (const auto& [tenant, storage] : tenantStorageMap) { for (const auto& [tenantGroup, storage] : tenantStorageMap) {
if (storage.usage > storage.quota) { if (storage.usage > storage.quota) {
tenants.insert(tenant); tenantsOverQuota.insert(storage.tenants.begin(), storage.tenants.end());
} }
} }
return tenants; return tenantsOverQuota;
} }
Future<Void> TenantCache::monitorTenantMap() { Future<Void> TenantCache::monitorTenantMap() {

View File

@ -2025,7 +2025,8 @@ public:
bool memoryOnly, bool memoryOnly,
Reference<IPageEncryptionKeyProvider> keyProvider, Reference<IPageEncryptionKeyProvider> keyProvider,
Promise<Void> errorPromise = {}) Promise<Void> errorPromise = {})
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS), : keyProvider(keyProvider),
ioLock(makeReference<PriorityMultiLock>(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_IO_PRIORITIES)),
pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize), pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize),
filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise), filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise),
remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) { remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
@ -2037,7 +2038,7 @@ public:
// This sets the page cache size for all PageCacheT instances using the same evictor // This sets the page cache size for all PageCacheT instances using the same evictor
pageCache.evictor().sizeLimit = pageCacheBytes; pageCache.evictor().sizeLimit = pageCacheBytes;
g_redwoodMetrics.ioLock = &ioLock; g_redwoodMetrics.ioLock = ioLock.getPtr();
if (!g_redwoodMetricsActor.isValid()) { if (!g_redwoodMetricsActor.isValid()) {
g_redwoodMetricsActor = redwoodMetricsLogger(); g_redwoodMetricsActor = redwoodMetricsLogger();
} }
@ -2499,7 +2500,7 @@ public:
unsigned int level, unsigned int level,
bool header) { bool header) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority)); state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(header ? ioMaxPriority : ioMinPriority));
++g_redwoodMetrics.metric.pagerDiskWrite; ++g_redwoodMetrics.metric.pagerDiskWrite;
g_redwoodMetrics.level(level).metrics.events.addEventReason(PagerEvents::PageWrite, reason); g_redwoodMetrics.level(level).metrics.events.addEventReason(PagerEvents::PageWrite, reason);
if (self->memoryOnly) { if (self->memoryOnly) {
@ -2779,7 +2780,7 @@ public:
int blockSize, int blockSize,
int64_t offset, int64_t offset,
int priority) { int priority) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority))); state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(std::min(priority, ioMaxPriority)));
++g_redwoodMetrics.metric.pagerDiskRead; ++g_redwoodMetrics.metric.pagerDiskRead;
int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset)); int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset));
return bytes; return bytes;
@ -3593,7 +3594,7 @@ public:
// The next section explicitly cancels all pending operations held in the pager // The next section explicitly cancels all pending operations held in the pager
debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str()); debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str());
self->ioLock.kill(); self->ioLock->kill();
debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str()); debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
self->recoverFuture.cancel(); self->recoverFuture.cancel();
@ -3802,7 +3803,7 @@ private:
Reference<IPageEncryptionKeyProvider> keyProvider; Reference<IPageEncryptionKeyProvider> keyProvider;
PriorityMultiLock ioLock; Reference<PriorityMultiLock> ioLock;
int64_t pageCacheBytes; int64_t pageCacheBytes;
@ -8894,32 +8895,25 @@ void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) {
int maxPriority = ioLock->maxPriority(); int maxPriority = ioLock->maxPriority();
if (e != nullptr) { if (e != nullptr) {
e->detail("ActiveReads", ioLock->totalRunners()); e->detail("IOActiveTotal", ioLock->getRunnersCount());
e->detail("AwaitReads", ioLock->totalWaiters()); e->detail("IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) { for (int priority = 0; priority <= maxPriority; ++priority) {
e->detail(format("ActiveP%d", priority), ioLock->numRunners(priority)); e->detail(format("IOActiveP%d", priority), ioLock->getRunnersCount(priority));
e->detail(format("AwaitP%d", priority), ioLock->numWaiters(priority)); e->detail(format("IOWaitingP%d", priority), ioLock->getWaitersCount(priority));
} }
} }
if (s != nullptr) { if (s != nullptr) {
std::string active = "Active";
std::string await = "Await";
*s += "\n"; *s += "\n";
*s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners()); *s += format("%-15s %-8u ", "IOActiveTotal", ioLock->getRunnersCount());
*s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters());
*s += "\n";
for (int priority = 0; priority <= maxPriority; ++priority) { for (int priority = 0; priority <= maxPriority; ++priority) {
*s += *s += format("IOActiveP%-6d %-8u ", priority, ioLock->getRunnersCount(priority));
format("%-15s %-8u ", (active + 'P' + std::to_string(priority)).c_str(), ioLock->numRunners(priority));
} }
*s += "\n"; *s += "\n";
*s += format("%-15s %-8u ", "IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) { for (int priority = 0; priority <= maxPriority; ++priority) {
*s += *s += format("IOWaitingP%-5d %-8u ", priority, ioLock->getWaitersCount(priority));
format("%-15s %-8u ", (await + 'P' + std::to_string(priority)).c_str(), ioLock->numWaiters(priority));
} }
} }
} }
@ -11407,57 +11401,3 @@ TEST_CASE(":/redwood/performance/histograms") {
return Void(); return Void();
} }
ACTOR Future<Void> waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) {
state PriorityMultiLock::Lock lock = wait(pml->lock(priority));
wait(delay(deterministicRandom()->random01() * .1));
++*pout;
return Void();
}
TEST_CASE("/redwood/PriorityMultiLock") {
state std::vector<int> priorities = { 10, 20, 40 };
state int concurrency = 25;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
// Clog the lock buy taking concurrency locks at each level
state std::vector<Future<PriorityMultiLock::Lock>> lockFutures;
for (int i = 0; i < priorities.size(); ++i) {
for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(i));
}
}
// Wait for n = concurrency locks to be acquired
wait(quorum(lockFutures, concurrency));
state std::vector<Future<Void>> futures;
for (int i = 0; i < 10e3; ++i) {
int p = i % priorities.size();
futures.push_back(waitLockIncrement(pml, p, &counts[p]));
}
state Future<Void> f = waitForAll(futures);
// Release the locks
lockFutures.clear();
// Print stats and wait for all futures to be ready
loop {
choose {
when(wait(delay(1))) {
printf("counts: ");
for (auto c : counts) {
printf("%d ", c);
}
printf(" pml: %s\n", pml->toString().c_str());
}
when(wait(f)) { break; }
}
}
delete pml;
return Void();
}

View File

@ -162,10 +162,7 @@ ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> b
ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn); ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn); ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider> blobConn); ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef range);
inline bool isFullRestoreMode() {
return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE;
};
#include "flow/unactorcompiler.h" #include "flow/unactorcompiler.h"

View File

@ -30,6 +30,7 @@
struct BlobMigratorInterface { struct BlobMigratorInterface {
constexpr static FileIdentifier file_identifier = 869199; constexpr static FileIdentifier file_identifier = 869199;
RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator; RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
RequestStream<ReplyPromise<Void>> waitFailure;
LocalityData locality; LocalityData locality;
UID uniqueID; UID uniqueID;
StorageServerInterface ssi; StorageServerInterface ssi;
@ -48,7 +49,7 @@ struct BlobMigratorInterface {
template <class Archive> template <class Archive>
void serialize(Archive& ar) { void serialize(Archive& ar) {
serializer(ar, locality, uniqueID, haltBlobMigrator); serializer(ar, locality, uniqueID, haltBlobMigrator, waitFailure);
} }
}; };

View File

@ -144,6 +144,7 @@ public:
Future<Void> clientCounter; Future<Void> clientCounter;
int clientCount; int clientCount;
AsyncVar<bool> blobGranulesEnabled; AsyncVar<bool> blobGranulesEnabled;
AsyncVar<bool> blobRestoreEnabled;
ClusterType clusterType = ClusterType::STANDALONE; ClusterType clusterType = ClusterType::STANDALONE;
Optional<ClusterName> metaclusterName; Optional<ClusterName> metaclusterName;
Optional<MetaclusterRegistrationEntry> metaclusterRegistration; Optional<MetaclusterRegistrationEntry> metaclusterRegistration;
@ -159,7 +160,7 @@ public:
TaskPriority::DefaultEndpoint, TaskPriority::DefaultEndpoint,
LockAware::True)), // SOMEDAY: Locality! LockAware::True)), // SOMEDAY: Locality!
unfinishedRecoveries(0), logGenerations(0), cachePopulated(false), clientCount(0), unfinishedRecoveries(0), logGenerations(0), cachePopulated(false), clientCount(0),
blobGranulesEnabled(config.blobGranulesEnabled) { blobGranulesEnabled(config.blobGranulesEnabled), blobRestoreEnabled(false) {
clientCounter = countClients(this); clientCounter = countClients(this);
} }

View File

@ -60,6 +60,7 @@ class GrvProxyTagThrottler {
void setRate(double rate); void setRate(double rate);
bool isMaxThrottled(double maxThrottleDuration) const; bool isMaxThrottled(double maxThrottleDuration) const;
void rejectRequests(LatencyBandsMap&); void rejectRequests(LatencyBandsMap&);
void endReleaseWindow(int64_t numStarted, double elapsed);
}; };
// Track the budgets for each tag // Track the budgets for each tag

View File

@ -55,7 +55,7 @@ public:
// Updates the budget to accumulate any extra capacity available or remove any excess that was used. // Updates the budget to accumulate any extra capacity available or remove any excess that was used.
// Call at the end of a release window. // Call at the end of a release window.
void endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed); void endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed);
// Smoothly sets rate. If currently disabled, reenable // Smoothly sets rate. If currently disabled, reenable
void setRate(double rate); void setRate(double rate);

View File

@ -35,8 +35,9 @@ typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;
struct Storage { struct Storage {
int64_t quota = std::numeric_limits<int64_t>::max(); int64_t quota = std::numeric_limits<int64_t>::max();
int64_t usage = 0; int64_t usage = 0;
std::unordered_set<TenantName> tenants;
}; };
typedef std::unordered_map<TenantName, Storage> TenantStorageMap; typedef std::unordered_map<TenantGroupName, Storage> TenantStorageMap;
struct TenantCacheTenantCreated { struct TenantCacheTenantCreated {
KeyRange keys; KeyRange keys;
@ -56,7 +57,8 @@ private:
uint64_t generation; uint64_t generation;
TenantMapByPrefix tenantCache; TenantMapByPrefix tenantCache;
// Map from tenant names to storage quota and usage // Map from tenant group names to the list of tenants, cumumlative storage used by
// all the tenants in the group, and its storage quota.
TenantStorageMap tenantStorageMap; TenantStorageMap tenantStorageMap;
// mark the start of a new sweep of the tenant cache // mark the start of a new sweep of the tenant cache

View File

@ -435,6 +435,7 @@ struct StorageServerDisk {
// The following are pointers to the Counters in StorageServer::counters of the same names. // The following are pointers to the Counters in StorageServer::counters of the same names.
Counter* kvCommitLogicalBytes; Counter* kvCommitLogicalBytes;
Counter* kvClearRanges; Counter* kvClearRanges;
Counter* kvClearSingleKey;
Counter* kvGets; Counter* kvGets;
Counter* kvScans; Counter* kvScans;
Counter* kvCommits; Counter* kvCommits;
@ -1109,15 +1110,13 @@ public:
FlowLock serveFetchCheckpointParallelismLock; FlowLock serveFetchCheckpointParallelismLock;
PriorityMultiLock ssLock; Reference<PriorityMultiLock> ssLock;
std::vector<int> readPriorityRanks; std::vector<int> readPriorityRanks;
Future<PriorityMultiLock::Lock> getReadLock(const Optional<ReadOptions>& options) { Future<PriorityMultiLock::Lock> getReadLock(const Optional<ReadOptions>& options) {
// TODO: Fix perf regression in 100% cache read case where taking this lock adds too much overhead int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL);
return PriorityMultiLock::Lock(); readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
// int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL); return ssLock->lock(readPriorityRanks[readType]);
// readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
// return ssLock.lock(readPriorityRanks[readType]);
} }
FlowLock serveAuditStorageParallelismLock; FlowLock serveAuditStorageParallelismLock;
@ -1172,6 +1171,8 @@ public:
Counter kvCommitLogicalBytes; Counter kvCommitLogicalBytes;
// Count of all clearRange operatons to the storage engine. // Count of all clearRange operatons to the storage engine.
Counter kvClearRanges; Counter kvClearRanges;
// Count of all clearRange operations on a singlekeyRange(key delete) to the storage engine.
Counter kvClearSingleKey;
// ClearRange operations issued by FDB, instead of from users, e.g., ClearRange operations to remove a shard // ClearRange operations issued by FDB, instead of from users, e.g., ClearRange operations to remove a shard
// from a storage server, as in removeDataRange(). // from a storage server, as in removeDataRange().
Counter kvSystemClearRanges; Counter kvSystemClearRanges;
@ -1247,8 +1248,8 @@ public:
feedVersionQueries("FeedVersionQueries", cc), bytesInput("BytesInput", cc), feedVersionQueries("FeedVersionQueries", cc), bytesInput("BytesInput", cc),
logicalBytesInput("LogicalBytesInput", cc), logicalBytesMoveInOverhead("LogicalBytesMoveInOverhead", cc), logicalBytesInput("LogicalBytesInput", cc), logicalBytesMoveInOverhead("LogicalBytesMoveInOverhead", cc),
kvCommitLogicalBytes("KVCommitLogicalBytes", cc), kvClearRanges("KVClearRanges", cc), kvCommitLogicalBytes("KVCommitLogicalBytes", cc), kvClearRanges("KVClearRanges", cc),
kvSystemClearRanges("KVSystemClearRanges", cc), bytesDurable("BytesDurable", cc), kvClearSingleKey("KVClearSingleKey", cc), kvSystemClearRanges("KVSystemClearRanges", cc),
bytesFetched("BytesFetched", cc), mutationBytes("MutationBytes", cc), bytesDurable("BytesDurable", cc), bytesFetched("BytesFetched", cc), mutationBytes("MutationBytes", cc),
feedBytesFetched("FeedBytesFetched", cc), sampledBytesCleared("SampledBytesCleared", cc), feedBytesFetched("FeedBytesFetched", cc), sampledBytesCleared("SampledBytesCleared", cc),
kvFetched("KVFetched", cc), mutations("Mutations", cc), setMutations("SetMutations", cc), kvFetched("KVFetched", cc), mutations("Mutations", cc), setMutations("SetMutations", cc),
clearRangeMutations("ClearRangeMutations", cc), atomicMutations("AtomicMutations", cc), clearRangeMutations("ClearRangeMutations", cc), atomicMutations("AtomicMutations", cc),
@ -1404,7 +1405,8 @@ public:
fetchKeysParallelismFullLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_FULL), fetchKeysParallelismFullLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_FULL),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false), fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM), serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM),
ssLock(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES), ssLock(makeReference<PriorityMultiLock>(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY,
SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES)),
serveAuditStorageParallelismLock(SERVER_KNOBS->SERVE_AUDIT_STORAGE_PARALLELISM), serveAuditStorageParallelismLock(SERVER_KNOBS->SERVE_AUDIT_STORAGE_PARALLELISM),
instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false), instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false),
versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0), versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0),
@ -1412,7 +1414,7 @@ public:
busiestWriteTagContext(ssi.id()), counters(this), busiestWriteTagContext(ssi.id()), counters(this),
storageServerSourceTLogIDEventHolder( storageServerSourceTLogIDEventHolder(
makeReference<EventCacheHolder>(ssi.id().toString() + "/StorageServerSourceTLogID")) { makeReference<EventCacheHolder>(ssi.id().toString() + "/StorageServerSourceTLogID")) {
readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READ_RANKS, ','); readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READTYPE_PRIORITY_MAP, ',');
ASSERT(readPriorityRanks.size() > (int)ReadType::MAX); ASSERT(readPriorityRanks.size() > (int)ReadType::MAX);
version.initMetric("StorageServer.Version"_sr, counters.cc.getId()); version.initMetric("StorageServer.Version"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId()); oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId());
@ -1431,6 +1433,7 @@ public:
this->storage.kvCommitLogicalBytes = &counters.kvCommitLogicalBytes; this->storage.kvCommitLogicalBytes = &counters.kvCommitLogicalBytes;
this->storage.kvClearRanges = &counters.kvClearRanges; this->storage.kvClearRanges = &counters.kvClearRanges;
this->storage.kvClearSingleKey = &counters.kvClearSingleKey;
this->storage.kvGets = &counters.kvGets; this->storage.kvGets = &counters.kvGets;
this->storage.kvScans = &counters.kvScans; this->storage.kvScans = &counters.kvScans;
this->storage.kvCommits = &counters.kvCommits; this->storage.kvCommits = &counters.kvCommits;
@ -4762,7 +4765,6 @@ ACTOR Future<Void> mapSubquery(StorageServer* data,
Arena* pArena, Arena* pArena,
int matchIndex, int matchIndex,
bool isRangeQuery, bool isRangeQuery,
bool isBoundary,
KeyValueRef* it, KeyValueRef* it,
MappedKeyValueRef* kvm, MappedKeyValueRef* kvm,
Key mappedKey) { Key mappedKey) {
@ -4770,31 +4772,42 @@ ACTOR Future<Void> mapSubquery(StorageServer* data,
// Use the mappedKey as the prefix of the range query. // Use the mappedKey as the prefix of the range query.
GetRangeReqAndResultRef getRange = wait(quickGetKeyValues(data, mappedKey, version, pArena, pOriginalReq)); GetRangeReqAndResultRef getRange = wait(quickGetKeyValues(data, mappedKey, version, pArena, pOriginalReq));
if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) || if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) ||
(getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY)) { (getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY) || matchIndex == MATCH_INDEX_ALL) {
kvm->key = it->key; kvm->key = it->key;
kvm->value = it->value; kvm->value = it->value;
} }
kvm->boundaryAndExist = isBoundary && !getRange.result.empty();
kvm->reqAndResult = getRange; kvm->reqAndResult = getRange;
} else { } else {
GetValueReqAndResultRef getValue = wait(quickGetValue(data, mappedKey, version, pArena, pOriginalReq)); GetValueReqAndResultRef getValue = wait(quickGetValue(data, mappedKey, version, pArena, pOriginalReq));
kvm->reqAndResult = getValue; kvm->reqAndResult = getValue;
kvm->boundaryAndExist = isBoundary && getValue.result.present();
} }
return Void(); return Void();
} }
int getMappedKeyValueSize(MappedKeyValueRef mappedKeyValue) {
auto& reqAndResult = mappedKeyValue.reqAndResult;
int bytes = 0;
if (std::holds_alternative<GetValueReqAndResultRef>(reqAndResult)) {
const auto& getValue = std::get<GetValueReqAndResultRef>(reqAndResult);
bytes = getValue.expectedSize();
} else if (std::holds_alternative<GetRangeReqAndResultRef>(reqAndResult)) {
const auto& getRange = std::get<GetRangeReqAndResultRef>(reqAndResult);
bytes = getRange.result.expectedSize();
} else {
throw internal_error();
}
return bytes;
}
ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data, ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
GetKeyValuesReply input, GetKeyValuesReply input,
StringRef mapper, StringRef mapper,
// To provide span context, tags, debug ID to underlying lookups. // To provide span context, tags, debug ID to underlying lookups.
GetMappedKeyValuesRequest* pOriginalReq, GetMappedKeyValuesRequest* pOriginalReq,
Optional<Key> tenantPrefix, int matchIndex,
int matchIndex) { int* remainingLimitBytes) {
state GetMappedKeyValuesReply result; state GetMappedKeyValuesReply result;
result.version = input.version; result.version = input.version;
result.more = input.more;
result.cached = input.cached; result.cached = input.cached;
result.arena.dependsOn(input.arena); result.arena.dependsOn(input.arena);
@ -4823,22 +4836,15 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
g_traceBatch.addEvent("TransactionDebug", g_traceBatch.addEvent("TransactionDebug",
pOriginalReq->options.get().debugID.get().first(), pOriginalReq->options.get().debugID.get().first(),
"storageserver.mapKeyValues.BeforeLoop"); "storageserver.mapKeyValues.BeforeLoop");
for (; offset < sz; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) {
for (; offset<sz&& * remainingLimitBytes> 0; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) {
// Divide into batches of MAX_PARALLEL_QUICK_GET_VALUE subqueries // Divide into batches of MAX_PARALLEL_QUICK_GET_VALUE subqueries
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) { for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
KeyValueRef* it = &input.data[i + offset]; KeyValueRef* it = &input.data[i + offset];
MappedKeyValueRef* kvm = &kvms[i]; MappedKeyValueRef* kvm = &kvms[i];
bool isBoundary = (i + offset) == 0 || (i + offset) == sz - 1; // Clear key value to the default.
// need to keep the boundary, so that caller can use it as a continuation. kvm->key = ""_sr;
if (isBoundary || matchIndex == MATCH_INDEX_ALL) { kvm->value = ""_sr;
kvm->key = it->key;
kvm->value = it->value;
} else {
// Clear key value to the default.
kvm->key = ""_sr;
kvm->value = ""_sr;
}
Key mappedKey = constructMappedKey(it, vt, mappedKeyFormatTuple); Key mappedKey = constructMappedKey(it, vt, mappedKeyFormatTuple);
// Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously. // Make sure the mappedKey is always available, so that it's good even we want to get key asynchronously.
result.arena.dependsOn(mappedKey.arena()); result.arena.dependsOn(mappedKey.arena());
@ -4846,16 +4852,8 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
// std::cout << "key:" << printable(kvm->key) << ", value:" << printable(kvm->value) // std::cout << "key:" << printable(kvm->key) << ", value:" << printable(kvm->value)
// << ", mappedKey:" << printable(mappedKey) << std::endl; // << ", mappedKey:" << printable(mappedKey) << std::endl;
subqueries.push_back(mapSubquery(data, subqueries.push_back(mapSubquery(
input.version, data, input.version, pOriginalReq, &result.arena, matchIndex, isRangeQuery, it, kvm, mappedKey));
pOriginalReq,
&result.arena,
matchIndex,
isRangeQuery,
isBoundary,
it,
kvm,
mappedKey));
} }
wait(waitForAll(subqueries)); wait(waitForAll(subqueries));
if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present()) if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present())
@ -4864,9 +4862,31 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
"storageserver.mapKeyValues.AfterBatch"); "storageserver.mapKeyValues.AfterBatch");
subqueries.clear(); subqueries.clear();
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) { for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
// since we always read the index, so always consider the index size
int indexSize = sizeof(KeyValueRef) + input.data[i + offset].expectedSize();
int size = indexSize + getMappedKeyValueSize(kvms[i]);
*remainingLimitBytes -= size;
result.data.push_back(result.arena, kvms[i]); result.data.push_back(result.arena, kvms[i]);
if (SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT && *remainingLimitBytes <= 0) {
break;
}
} }
} }
int resultSize = result.data.size();
if (resultSize > 0) {
// keep index for boundary index entries, so that caller can use it as a continuation.
result.data[0].key = input.data[0].key;
result.data[0].value = input.data[0].value;
result.data[0].boundaryAndExist = getMappedKeyValueSize(kvms[0]) > 0;
result.data.back().key = input.data[resultSize - 1].key;
result.data.back().value = input.data[resultSize - 1].value;
// index needs to be -1
int index = (resultSize - 1) % SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
result.data.back().boundaryAndExist = getMappedKeyValueSize(kvms[index]) > 0;
}
result.more = input.more || resultSize < sz;
if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present()) if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present())
g_traceBatch.addEvent("TransactionDebug", g_traceBatch.addEvent("TransactionDebug",
pOriginalReq->options.get().debugID.get().first(), pOriginalReq->options.get().debugID.get().first(),
@ -5121,12 +5141,15 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
req.reply.send(none); req.reply.send(none);
} else { } else {
state int remainingLimitBytes = req.limitBytes; state int remainingLimitBytes = req.limitBytes;
// create a temporary byte limit for index fetching ONLY, this should be excessive
// because readRange is cheap when reading additional bytes
state int bytesForIndex =
std::min(req.limitBytes, (int)(req.limitBytes * SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH));
GetKeyValuesReply getKeyValuesReply = wait(readRange(data, GetKeyValuesReply getKeyValuesReply = wait(readRange(data,
version, version,
KeyRangeRef(begin, end), KeyRangeRef(begin, end),
req.limit, req.limit,
&remainingLimitBytes, &bytesForIndex,
span.context, span.context,
req.options, req.options,
tenantPrefix)); tenantPrefix));
@ -5140,9 +5163,10 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
try { try {
// Map the scanned range to another list of keys and look up. // Map the scanned range to another list of keys and look up.
GetMappedKeyValuesReply _r = GetMappedKeyValuesReply _r =
wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, tenantPrefix, req.matchIndex)); wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, req.matchIndex, &remainingLimitBytes));
r = _r; r = _r;
} catch (Error& e) { } catch (Error& e) {
// catch txn_too_old here if prefetch runs for too long, and returns it back to client
TraceEvent("MapError").error(e); TraceEvent("MapError").error(e);
throw; throw;
} }
@ -6138,6 +6162,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> tryReadBlobGranules(Tra
loop { loop {
try { try {
Standalone<VectorRef<BlobGranuleChunkRef>> chunks = wait(tr->readBlobGranules(keys, 0, readVersion)); Standalone<VectorRef<BlobGranuleChunkRef>> chunks = wait(tr->readBlobGranules(keys, 0, readVersion));
TraceEvent(SevDebug, "ReadBlobGranules").detail("Keys", keys).detail("Chunks", chunks.size());
return chunks; return chunks;
} catch (Error& e) { } catch (Error& e) {
if (retryCount >= maxRetryCount) { if (retryCount >= maxRetryCount) {
@ -6169,10 +6194,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
for (i = 0; i < chunks.size(); ++i) { for (i = 0; i < chunks.size(); ++i) {
state KeyRangeRef chunkRange = chunks[i].keyRange; state KeyRangeRef chunkRange = chunks[i].keyRange;
state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn)); state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn));
TraceEvent("ReadBlobData") TraceEvent(SevDebug, "ReadBlobData").detail("Rows", rows.size()).detail("ChunkRange", chunkRange);
.detail("Rows", rows.size())
.detail("ChunkRange", chunkRange.toString())
.detail("Keys", keys.toString());
if (rows.size() == 0) { if (rows.size() == 0) {
rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end)); rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end));
} }
@ -6185,7 +6207,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
} catch (Error& e) { } catch (Error& e) {
TraceEvent(SevWarn, "ReadBlobDataFailure") TraceEvent(SevWarn, "ReadBlobDataFailure")
.suppressFor(5.0) .suppressFor(5.0)
.detail("Keys", keys.toString()) .detail("Keys", keys)
.detail("FetchVersion", fetchVersion) .detail("FetchVersion", fetchVersion)
.detail("Error", e.what()); .detail("Error", e.what());
tr->reset(); tr->reset();
@ -6994,7 +7016,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
// We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure // We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure
// change feed mutations get applied correctly // change feed mutations get applied correctly
state std::vector<Key> changeFeedsToFetch; state std::vector<Key> changeFeedsToFetch;
if (!isFullRestoreMode()) { state bool isFullRestore = wait(isFullRestoreMode(data->cx, keys));
if (!isFullRestore) {
std::vector<Key> _cfToFetch = wait(fetchCFMetadata); std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
changeFeedsToFetch = _cfToFetch; changeFeedsToFetch = _cfToFetch;
} }
@ -7072,7 +7095,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
state PromiseStream<RangeResult> results; state PromiseStream<RangeResult> results;
state Future<Void> hold; state Future<Void> hold;
if (SERVER_KNOBS->FETCH_USING_BLOB) { if (isFullRestore) {
hold = tryGetRangeFromBlob(results, &tr, keys, fetchVersion, data->blobConn); hold = tryGetRangeFromBlob(results, &tr, keys, fetchVersion, data->blobConn);
} else { } else {
hold = tryGetRange(results, &tr, keys); hold = tryGetRange(results, &tr, keys);
@ -7110,7 +7133,6 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
data->thisServerID); data->thisServerID);
} }
} }
metricReporter.addFetchedBytes(expectedBlockSize, this_block.size()); metricReporter.addFetchedBytes(expectedBlockSize, this_block.size());
// Write this_block to storage // Write this_block to storage
@ -9703,6 +9725,9 @@ void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned)
void StorageServerDisk::clearRange(KeyRangeRef keys) { void StorageServerDisk::clearRange(KeyRangeRef keys) {
storage->clear(keys, &data->metrics); storage->clear(keys, &data->metrics);
++(*kvClearRanges); ++(*kvClearRanges);
if (keys.singleKeyRange()) {
++(*kvClearSingleKey);
}
} }
void StorageServerDisk::writeKeyValue(KeyValueRef kv) { void StorageServerDisk::writeKeyValue(KeyValueRef kv) {
@ -9717,6 +9742,9 @@ void StorageServerDisk::writeMutation(MutationRef mutation) {
} else if (mutation.type == MutationRef::ClearRange) { } else if (mutation.type == MutationRef::ClearRange) {
storage->clear(KeyRangeRef(mutation.param1, mutation.param2), &data->metrics); storage->clear(KeyRangeRef(mutation.param1, mutation.param2), &data->metrics);
++(*kvClearRanges); ++(*kvClearRanges);
if (KeyRangeRef(mutation.param1, mutation.param2).singleKeyRange()) {
++(*kvClearSingleKey);
}
} else } else
ASSERT(false); ASSERT(false);
} }
@ -9732,6 +9760,9 @@ void StorageServerDisk::writeMutations(const VectorRef<MutationRef>& mutations,
} else if (m.type == MutationRef::ClearRange) { } else if (m.type == MutationRef::ClearRange) {
storage->clear(KeyRangeRef(m.param1, m.param2), &data->metrics); storage->clear(KeyRangeRef(m.param1, m.param2), &data->metrics);
++(*kvClearRanges); ++(*kvClearRanges);
if (KeyRangeRef(m.param1, m.param2).singleKeyRange()) {
++(*kvClearSingleKey);
}
} }
} }
} }
@ -10399,20 +10430,20 @@ ACTOR Future<Void> metricsCore(StorageServer* self, StorageServerInterface ssi)
te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString()); te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
te.detail("Tag", self->tag.toString()); te.detail("Tag", self->tag.toString());
std::vector<int> rpr = self->readPriorityRanks; std::vector<int> rpr = self->readPriorityRanks;
te.detail("ReadsActive", self->ssLock.totalRunners()); te.detail("ReadsTotalActive", self->ssLock->getRunnersCount());
te.detail("ReadsWaiting", self->ssLock.totalWaiters()); te.detail("ReadsTotalWaiting", self->ssLock->getWaitersCount());
int type = (int)ReadType::FETCH; int type = (int)ReadType::FETCH;
te.detail("ReadFetchActive", self->ssLock.numRunners(rpr[type])); te.detail("ReadFetchActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadFetchWaiting", self->ssLock.numWaiters(rpr[type])); te.detail("ReadFetchWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::LOW; type = (int)ReadType::LOW;
te.detail("ReadLowActive", self->ssLock.numRunners(rpr[type])); te.detail("ReadLowActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadLowWaiting", self->ssLock.numWaiters(rpr[type])); te.detail("ReadLowWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::NORMAL; type = (int)ReadType::NORMAL;
te.detail("ReadNormalActive", self->ssLock.numRunners(rpr[type])); te.detail("ReadNormalActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadNormalWaiting", self->ssLock.numWaiters(rpr[type])); te.detail("ReadNormalWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::HIGH; type = (int)ReadType::HIGH;
te.detail("ReadHighActive", self->ssLock.numRunners(rpr[type])); te.detail("ReadHighActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadHighWaiting", self->ssLock.numWaiters(rpr[type])); te.detail("ReadHighWaiting", self->ssLock->getWaitersCount(rpr[type]));
StorageBytes sb = self->storage.getStorageBytes(); StorageBytes sb = self->storage.getStorageBytes();
te.detail("KvstoreBytesUsed", sb.used); te.detail("KvstoreBytesUsed", sb.used);
te.detail("KvstoreBytesFree", sb.free); te.detail("KvstoreBytesFree", sb.free);
@ -11228,7 +11259,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
// If the storage server dies while something that uses self is still on the stack, // If the storage server dies while something that uses self is still on the stack,
// we want that actor to complete before we terminate and that memory goes out of scope // we want that actor to complete before we terminate and that memory goes out of scope
self.ssLock.kill(); self.ssLock->kill();
state Error err = e; state Error err = e;
if (storageServerTerminated(self, persistentData, err)) { if (storageServerTerminated(self, persistentData, err)) {
@ -11326,7 +11357,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
throw internal_error(); throw internal_error();
} catch (Error& e) { } catch (Error& e) {
self.ssLock.kill(); self.ssLock->kill();
if (self.byteSampleRecovery.isValid()) { if (self.byteSampleRecovery.isValid()) {
self.byteSampleRecovery.cancel(); self.byteSampleRecovery.cancel();

View File

@ -2335,6 +2335,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
} else { } else {
startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id()); startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
DUMPTOKEN(recruited.haltBlobMigrator); DUMPTOKEN(recruited.haltBlobMigrator);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.ssi.getValue); DUMPTOKEN(recruited.ssi.getValue);
DUMPTOKEN(recruited.ssi.getKey); DUMPTOKEN(recruited.ssi.getKey);
DUMPTOKEN(recruited.ssi.getKeyValues); DUMPTOKEN(recruited.ssi.getKeyValues);
@ -2345,7 +2346,6 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
DUMPTOKEN(recruited.ssi.getReadHotRanges); DUMPTOKEN(recruited.ssi.getReadHotRanges);
DUMPTOKEN(recruited.ssi.getRangeSplitPoints); DUMPTOKEN(recruited.ssi.getRangeSplitPoints);
DUMPTOKEN(recruited.ssi.getStorageMetrics); DUMPTOKEN(recruited.ssi.getStorageMetrics);
DUMPTOKEN(recruited.ssi.waitFailure);
DUMPTOKEN(recruited.ssi.getQueuingMetrics); DUMPTOKEN(recruited.ssi.getQueuingMetrics);
DUMPTOKEN(recruited.ssi.getKeyValueStoreType); DUMPTOKEN(recruited.ssi.getKeyValueStoreType);
DUMPTOKEN(recruited.ssi.watchValue); DUMPTOKEN(recruited.ssi.watchValue);

View File

@ -20,7 +20,9 @@
#include <cstdint> #include <cstdint>
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantManagement.actor.h" #include "fdbclient/TenantManagement.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
@ -28,9 +30,13 @@
struct CreateTenantWorkload : TestWorkload { struct CreateTenantWorkload : TestWorkload {
static constexpr auto NAME = "CreateTenant"; static constexpr auto NAME = "CreateTenant";
TenantName tenant; TenantName tenant;
Optional<TenantGroupName> tenantGroup;
CreateTenantWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { CreateTenantWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
tenant = getOption(options, "name"_sr, "DefaultTenant"_sr); tenant = getOption(options, "name"_sr, "DefaultTenant"_sr);
if (hasOption(options, "group"_sr)) {
tenantGroup = getOption(options, "group"_sr, "DefaultGroup"_sr);
}
} }
Future<Void> setup(Database const& cx) override { Future<Void> setup(Database const& cx) override {
@ -46,7 +52,12 @@ struct CreateTenantWorkload : TestWorkload {
ACTOR static Future<Void> _setup(CreateTenantWorkload* self, Database db) { ACTOR static Future<Void> _setup(CreateTenantWorkload* self, Database db) {
try { try {
Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(db.getReference(), self->tenant)); TenantMapEntry givenEntry;
if (self->tenantGroup.present()) {
givenEntry.tenantGroup = self->tenantGroup.get();
givenEntry.encrypted = SERVER_KNOBS->ENABLE_ENCRYPTION;
}
Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(db.getReference(), self->tenant, givenEntry));
ASSERT(entry.present()); ASSERT(entry.present());
} catch (Error& e) { } catch (Error& e) {
TraceEvent(SevError, "TenantCreationFailed").error(e); TraceEvent(SevError, "TenantCreationFailed").error(e);

View File

@ -38,6 +38,8 @@ const KeyRef prefix = "prefix"_sr;
const KeyRef RECORD = "RECORD"_sr; const KeyRef RECORD = "RECORD"_sr;
const KeyRef INDEX = "INDEX"_sr; const KeyRef INDEX = "INDEX"_sr;
int recordSize;
int indexSize;
struct GetMappedRangeWorkload : ApiWorkload { struct GetMappedRangeWorkload : ApiWorkload {
static constexpr auto NAME = "GetMappedRange"; static constexpr auto NAME = "GetMappedRange";
bool enabled; bool enabled;
@ -93,19 +95,32 @@ struct GetMappedRangeWorkload : ApiWorkload {
loop { loop {
std::cout << "start fillInRecords n=" << n << std::endl; std::cout << "start fillInRecords n=" << n << std::endl;
// TODO: When n is large, split into multiple transactions. // TODO: When n is large, split into multiple transactions.
recordSize = 0;
indexSize = 0;
try { try {
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (self->SPLIT_RECORDS) { if (self->SPLIT_RECORDS) {
for (int split = 0; split < SPLIT_SIZE; split++) { for (int split = 0; split < SPLIT_SIZE; split++) {
tr.set(recordKey(i, split), recordValue(i, split)); tr.set(recordKey(i, split), recordValue(i, split));
if (i == 0) {
recordSize +=
recordKey(i, split).size() + recordValue(i, split).size() + sizeof(KeyValueRef);
}
} }
} else { } else {
tr.set(recordKey(i), recordValue(i)); tr.set(recordKey(i), recordValue(i));
if (i == 0) {
recordSize += recordKey(i).size() + recordValue(i).size() + sizeof(KeyValueRef);
}
} }
tr.set(indexEntryKey(i), EMPTY); tr.set(indexEntryKey(i), EMPTY);
if (i == 0) {
indexSize += indexEntryKey(i).size() + sizeof(KeyValueRef);
}
} }
wait(tr.commit()); wait(tr.commit());
std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << std::endl; std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << " recordSize "
<< recordSize << " indexSize " << indexSize << std::endl;
break; break;
} catch (Error& e) { } catch (Error& e) {
std::cout << "failed fillInRecords, retry" << std::endl; std::cout << "failed fillInRecords, retry" << std::endl;
@ -146,8 +161,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
int matchIndex, int matchIndex,
bool isBoundary, bool isBoundary,
bool allMissing) { bool allMissing) {
// std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key) << " // std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key)
// indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId)) << std::endl; // << " indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId))
// << " matchIndex: " << matchIndex << std::endl;
if (matchIndex == MATCH_INDEX_ALL || isBoundary) { if (matchIndex == MATCH_INDEX_ALL || isBoundary) {
ASSERT(it->key == indexEntryKey(expectedId)); ASSERT(it->key == indexEntryKey(expectedId));
} else if (matchIndex == MATCH_INDEX_MATCHED_ONLY) { } else if (matchIndex == MATCH_INDEX_MATCHED_ONLY) {
@ -163,7 +179,6 @@ struct GetMappedRangeWorkload : ApiWorkload {
ASSERT(std::holds_alternative<GetRangeReqAndResultRef>(it->reqAndResult)); ASSERT(std::holds_alternative<GetRangeReqAndResultRef>(it->reqAndResult));
auto& getRange = std::get<GetRangeReqAndResultRef>(it->reqAndResult); auto& getRange = std::get<GetRangeReqAndResultRef>(it->reqAndResult);
auto& rangeResult = getRange.result; auto& rangeResult = getRange.result;
ASSERT(it->boundaryAndExist == (isBoundary && !rangeResult.empty()));
// std::cout << "rangeResult.size()=" << rangeResult.size() << std::endl; // std::cout << "rangeResult.size()=" << rangeResult.size() << std::endl;
// In the future, we may be able to do the continuation more efficiently by combining partial results // In the future, we may be able to do the continuation more efficiently by combining partial results
// together and then validate. // together and then validate.
@ -200,6 +215,7 @@ struct GetMappedRangeWorkload : ApiWorkload {
KeySelector endSelector, KeySelector endSelector,
Key mapper, Key mapper,
int limit, int limit,
int byteLimit,
int expectedBeginId, int expectedBeginId,
GetMappedRangeWorkload* self, GetMappedRangeWorkload* self,
int matchIndex, int matchIndex,
@ -207,14 +223,16 @@ struct GetMappedRangeWorkload : ApiWorkload {
std::cout << "start scanMappedRangeWithLimits beginSelector:" << beginSelector.toString() std::cout << "start scanMappedRangeWithLimits beginSelector:" << beginSelector.toString()
<< " endSelector:" << endSelector.toString() << " expectedBeginId:" << expectedBeginId << " endSelector:" << endSelector.toString() << " expectedBeginId:" << expectedBeginId
<< " limit:" << limit << std::endl; << " limit:" << limit << " byteLimit: " << byteLimit << " recordSize: " << recordSize
<< " STRICTLY_ENFORCE_BYTE_LIMIT: " << SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT << " allMissing "
<< allMissing << std::endl;
loop { loop {
state Reference<TransactionWrapper> tr = self->createTransaction(); state Reference<TransactionWrapper> tr = self->createTransaction();
try { try {
MappedRangeResult result = wait(tr->getMappedRange(beginSelector, MappedRangeResult result = wait(tr->getMappedRange(beginSelector,
endSelector, endSelector,
mapper, mapper,
GetRangeLimits(limit), GetRangeLimits(limit, byteLimit),
matchIndex, matchIndex,
self->snapshot, self->snapshot,
Reverse::False)); Reverse::False));
@ -270,17 +288,51 @@ struct GetMappedRangeWorkload : ApiWorkload {
Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone(); Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone();
state KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple)); state KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple));
state int limit = 100; state int limit = 100;
state int byteLimit = deterministicRandom()->randomInt(1, 9) * 10000;
state int expectedBeginId = beginId; state int expectedBeginId = beginId;
std::cout << "ByteLimit: " << byteLimit << " limit: " << limit
<< " FRACTION_INDEX_BYTELIMIT_PREFETCH: " << SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH
<< " MAX_PARALLEL_QUICK_GET_VALUE: " << SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE << std::endl;
while (true) { while (true) {
MappedRangeResult result = wait(self->scanMappedRangeWithLimits( MappedRangeResult result = wait(self->scanMappedRangeWithLimits(cx,
cx, beginSelector, endSelector, mapper, limit, expectedBeginId, self, matchIndex, allMissing)); beginSelector,
endSelector,
mapper,
limit,
byteLimit,
expectedBeginId,
self,
matchIndex,
allMissing));
expectedBeginId += result.size(); expectedBeginId += result.size();
if (result.more) { if (result.more) {
if (result.empty()) { if (result.empty()) {
// This is usually not expected. // This is usually not expected.
std::cout << "not result but have more, try again" << std::endl; std::cout << "not result but have more, try again" << std::endl;
} else { } else {
// auto& reqAndResult = std::get<GetRangeReqAndResultRef>(result.back().reqAndResult); int size = allMissing ? indexSize : (indexSize + recordSize);
int expectedCnt = limit;
int indexByteLimit = byteLimit * SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH;
int indexCountByteLimit = indexByteLimit / indexSize + (indexByteLimit % indexSize != 0);
int indexCount = std::min(limit, indexCountByteLimit);
std::cout << "indexCount: " << indexCount << std::endl;
// result set cannot be larger than the number of index fetched
ASSERT(result.size() <= indexCount);
expectedCnt = std::min(expectedCnt, indexCount);
int boundByRecord;
if (SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT) {
// might have 1 additional entry over the limit
boundByRecord = byteLimit / size + (byteLimit % size != 0);
} else {
// might have 1 additional batch over the limit
int roundSize = size * SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
int round = byteLimit / roundSize + (byteLimit % roundSize != 0);
boundByRecord = round * SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
}
expectedCnt = std::min(expectedCnt, boundByRecord);
std::cout << "boundByRecord: " << boundByRecord << std::endl;
ASSERT(result.size() == expectedCnt);
beginSelector = KeySelector(firstGreaterThan(result.back().key)); beginSelector = KeySelector(firstGreaterThan(result.back().key));
} }
} else { } else {
@ -289,6 +341,7 @@ struct GetMappedRangeWorkload : ApiWorkload {
} }
} }
ASSERT(expectedBeginId == endId); ASSERT(expectedBeginId == endId);
return Void(); return Void();
} }
@ -433,6 +486,8 @@ struct GetMappedRangeWorkload : ApiWorkload {
} else if (r < 0.75) { } else if (r < 0.75) {
matchIndex = MATCH_INDEX_UNMATCHED_ONLY; matchIndex = MATCH_INDEX_UNMATCHED_ONLY;
} }
state bool originalStrictlyEnforeByteLimit = SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT;
(const_cast<ServerKnobs*> SERVER_KNOBS)->STRICTLY_ENFORCE_BYTE_LIMIT = deterministicRandom()->coinflip();
wait(self->scanMappedRange(cx, 10, 490, mapper, self, matchIndex)); wait(self->scanMappedRange(cx, 10, 490, mapper, self, matchIndex));
{ {
@ -440,6 +495,8 @@ struct GetMappedRangeWorkload : ApiWorkload {
wait(self->scanMappedRange(cx, 10, 490, mapper, self, MATCH_INDEX_UNMATCHED_ONLY, true)); wait(self->scanMappedRange(cx, 10, 490, mapper, self, MATCH_INDEX_UNMATCHED_ONLY, true));
} }
// reset it to default
(const_cast<ServerKnobs*> SERVER_KNOBS)->STRICTLY_ENFORCE_BYTE_LIMIT = originalStrictlyEnforeByteLimit;
return Void(); return Void();
} }

View File

@ -68,7 +68,17 @@ struct SpecialKeySpaceCorrectnessWorkload : TestWorkload {
void getMetrics(std::vector<PerfMetric>& m) override {} void getMetrics(std::vector<PerfMetric>& m) override {}
// disable the default timeout setting // disable the default timeout setting
double getCheckTimeout() const override { return std::numeric_limits<double>::max(); } double getCheckTimeout() const override { return std::numeric_limits<double>::max(); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override { out.insert("RandomMoveKeys"); }
void disableFailureInjectionWorkloads(std::set<std::string>& out) const override {
out.insert("RandomMoveKeys");
// Rollback interferes with the
// \xff\xff/worker_interfaces test, since it can
// trigger a cluster recvoery, causing the worker
// interface for a machine to be updated in the middle
// of the test.
out.insert("RollbackWorkload");
}
Future<Void> _setup(Database cx, SpecialKeySpaceCorrectnessWorkload* self) { Future<Void> _setup(Database cx, SpecialKeySpaceCorrectnessWorkload* self) {
cx->specialKeySpace = std::make_unique<SpecialKeySpace>(); cx->specialKeySpace = std::make_unique<SpecialKeySpace>();

View File

@ -18,9 +18,10 @@
* limitations under the License. * limitations under the License.
*/ */
#include "fdbrpc/TenantName.h"
#include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h" #include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbrpc/TenantName.h" #include "fdbrpc/TenantName.h"
#include "fdbserver/Knobs.h" #include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/workloads.actor.h"
@ -31,12 +32,16 @@
struct StorageQuotaWorkload : TestWorkload { struct StorageQuotaWorkload : TestWorkload {
static constexpr auto NAME = "StorageQuota"; static constexpr auto NAME = "StorageQuota";
TenantGroupName group;
TenantName tenant; TenantName tenant;
int nodeCount; int nodeCount;
TenantName emptyTenant;
StorageQuotaWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { StorageQuotaWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
nodeCount = getOption(options, "nodeCount"_sr, 10000); group = getOption(options, "group"_sr, "DefaultGroup"_sr);
tenant = getOption(options, "tenant"_sr, "DefaultTenant"_sr); tenant = getOption(options, "tenant"_sr, "DefaultTenant"_sr);
nodeCount = getOption(options, "nodeCount"_sr, 10000);
emptyTenant = getOption(options, "emptyTenant"_sr, "DefaultTenant"_sr);
} }
Future<Void> setup(Database const& cx) override { Future<Void> setup(Database const& cx) override {
@ -67,27 +72,42 @@ struct StorageQuotaWorkload : TestWorkload {
Standalone<KeyValueRef> operator()(int n) { return KeyValueRef(keyForIndex(n), value((n + 1) % nodeCount)); } Standalone<KeyValueRef> operator()(int n) { return KeyValueRef(keyForIndex(n), value((n + 1) % nodeCount)); }
ACTOR Future<Void> _start(StorageQuotaWorkload* self, Database cx) { ACTOR Future<Void> _start(StorageQuotaWorkload* self, Database cx) {
// Check that the quota set/get functions work as expected. state TenantMapEntry entry1 = wait(TenantAPI::getTenant(cx.getReference(), self->tenant));
// Set the quota to just below the current size. state TenantMapEntry entry2 = wait(TenantAPI::getTenant(cx.getReference(), self->emptyTenant));
ASSERT(entry1.tenantGroup.present() && entry1.tenantGroup.get() == self->group &&
entry2.tenantGroup.present() && entry2.tenantGroup.get() == self->group);
// Get the size of the non-empty tenant. We will set the quota of the tenant group
// to just below the current size of this tenant.
state int64_t size = wait(getSize(cx, self->tenant)); state int64_t size = wait(getSize(cx, self->tenant));
state int64_t quota = size - 1; state int64_t quota = size - 1;
wait(setStorageQuotaHelper(cx, self->tenant, quota));
state Optional<int64_t> quotaRead = wait(getStorageQuotaHelper(cx, self->tenant)); // Check that the quota set/get functions work as expected.
wait(setStorageQuotaHelper(cx, self->group, quota));
state Optional<int64_t> quotaRead = wait(getStorageQuotaHelper(cx, self->group));
ASSERT(quotaRead.present() && quotaRead.get() == quota); ASSERT(quotaRead.present() && quotaRead.get() == quota);
if (!SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED) { if (!SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
return Void(); return Void();
} }
// Check that writes are rejected when the tenant is over quota. // Check that writes to both the tenants are rejected when the group is over quota.
state bool rejected = wait(tryWrite(self, cx, /*expectOk=*/false)); state bool rejected1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/false));
ASSERT(rejected); ASSERT(rejected1);
state bool rejected2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/false));
ASSERT(rejected2);
// Increase the quota. Check that writes are now able to commit. // Increase the quota or clear the quota. Check that writes to both the tenants are now able to commit.
quota = size * 2; if (deterministicRandom()->coinflip()) {
wait(setStorageQuotaHelper(cx, self->tenant, quota)); quota = size * 2;
state bool committed = wait(tryWrite(self, cx, /*expectOk=*/true)); wait(setStorageQuotaHelper(cx, self->group, quota));
ASSERT(committed); } else {
wait(clearStorageQuotaHelper(cx, self->group));
}
state bool committed1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/true));
ASSERT(committed1);
state bool committed2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/true));
ASSERT(committed2);
return Void(); return Void();
} }
@ -115,11 +135,11 @@ struct StorageQuotaWorkload : TestWorkload {
} }
} }
ACTOR static Future<Void> setStorageQuotaHelper(Database cx, TenantName tenantName, int64_t quota) { ACTOR static Future<Void> setStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName, int64_t quota) {
state Transaction tr(cx); state Transaction tr(cx);
loop { loop {
try { try {
setStorageQuota(tr, tenantName, quota); setStorageQuota(tr, tenantGroupName, quota);
wait(tr.commit()); wait(tr.commit());
return Void(); return Void();
} catch (Error& e) { } catch (Error& e) {
@ -128,12 +148,24 @@ struct StorageQuotaWorkload : TestWorkload {
} }
} }
ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, TenantName tenantName) { ACTOR static Future<Void> clearStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName) {
state Transaction tr(cx); state Transaction tr(cx);
loop { loop {
try { try {
state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantName)); clearStorageQuota(tr, tenantGroupName);
wait(tr.commit()); wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName) {
state Transaction tr(cx);
loop {
try {
state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantGroupName));
return quota; return quota;
} catch (Error& e) { } catch (Error& e) {
wait(tr.onError(e)); wait(tr.onError(e));
@ -141,13 +173,13 @@ struct StorageQuotaWorkload : TestWorkload {
} }
} }
ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self, Database cx, bool expectOk) { ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self, Database cx, TenantName tenant, bool expectOk) {
state int i; state int i;
// Retry the transaction a few times if needed; this allows us wait for a while for all // Retry the transaction a few times if needed; this allows us wait for a while for all
// the storage usage and quota related monitors to fetch and propagate the latest information // the storage usage and quota related monitors to fetch and propagate the latest information
// about the tenants that are over storage quota. // about the tenants that are over storage quota.
for (i = 0; i < 10; i++) { for (i = 0; i < 10; i++) {
state Transaction tr(cx, self->tenant); state Transaction tr(cx, tenant);
loop { loop {
try { try {
Standalone<KeyValueRef> kv = Standalone<KeyValueRef> kv =

View File

@ -118,14 +118,14 @@ Arena::Arena(Arena&& r) noexcept = default;
Arena& Arena::operator=(const Arena& r) = default; Arena& Arena::operator=(const Arena& r) = default;
Arena& Arena::operator=(Arena&& r) noexcept = default; Arena& Arena::operator=(Arena&& r) noexcept = default;
void Arena::dependsOn(const Arena& p) { void Arena::dependsOn(const Arena& p) {
if (p.impl) { // x.dependsOn(y) is a no-op if they refer to the same ArenaBlocks.
// They will already have the same lifetime.
if (p.impl && p.impl.getPtr() != impl.getPtr()) {
allowAccess(impl.getPtr()); allowAccess(impl.getPtr());
allowAccess(p.impl.getPtr()); allowAccess(p.impl.getPtr());
ArenaBlock::dependOn(impl, p.impl.getPtr()); ArenaBlock::dependOn(impl, p.impl.getPtr());
disallowAccess(p.impl.getPtr()); disallowAccess(p.impl.getPtr());
if (p.impl.getPtr() != impl.getPtr()) { disallowAccess(impl.getPtr());
disallowAccess(impl.getPtr());
}
} }
} }
@ -297,6 +297,7 @@ void* ArenaBlock::make4kAlignedBuffer(uint32_t size) {
} }
void ArenaBlock::dependOn(Reference<ArenaBlock>& self, ArenaBlock* other) { void ArenaBlock::dependOn(Reference<ArenaBlock>& self, ArenaBlock* other) {
ASSERT(self->getData() != other->getData());
other->addref(); other->addref();
if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef)) if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef))
create(SMALL, self)->makeReference(other); create(SMALL, self)->makeReference(other);
@ -775,6 +776,16 @@ TEST_CASE("/flow/Arena/Size") {
return Void(); return Void();
} }
// Test that x.dependsOn(x) works, and is effectively a no-op.
TEST_CASE("/flow/Arena/SelfRef") {
Arena a(4096);
// This should be a no-op.
a.dependsOn(a);
return Void();
}
TEST_CASE("flow/StringRef/eat") { TEST_CASE("flow/StringRef/eat") {
StringRef str = "test/case"_sr; StringRef str = "test/case"_sr;
StringRef first = str.eat("/"); StringRef first = str.eat("/");
@ -815,4 +826,4 @@ TEST_CASE("flow/StringRef/eat") {
ASSERT(str == ""_sr); ASSERT(str == ""_sr);
return Void(); return Void();
} }

View File

@ -29,21 +29,25 @@
#define PRIORITYMULTILOCK_ACTOR_H #define PRIORITYMULTILOCK_ACTOR_H
#include "flow/flow.h" #include "flow/flow.h"
#include <boost/intrusive/list.hpp>
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
#define PRIORITYMULTILOCK_DEBUG 0 #define PRIORITYMULTILOCK_DEBUG 0
#if PRIORITYMULTILOCK_DEBUG || !defined(NO_INTELLISENSE) #if PRIORITYMULTILOCK_DEBUG || !defined(NO_INTELLISENSE)
#define pml_debug_printf(...) \ #define pml_debug_printf(...) \
if (now() > 0) \ if (now() > 0) { \
printf(__VA_ARGS__) printf("pml line=%04d ", __LINE__); \
printf(__VA_ARGS__); \
}
#else #else
#define pml_debug_printf(...) #define pml_debug_printf(...)
#endif #endif
// A multi user lock with a concurrent holder limit where waiters request a lock with a priority // A multi user lock with a concurrent holder limit where waiters request a lock with a priority
// id and are granted locks based on a total concurrency and relative weights of the current active // id and are granted locks based on a total concurrency and relative weights of the current active
// priorities. Priority id's must start at 0 and are sequential integers. // priorities. Priority id's must start at 0 and are sequential integers. Priority id numbers
// are not related to the importance of the priority in execution.
// //
// Scheduling logic // Scheduling logic
// Let // Let
@ -64,17 +68,17 @@
// The interface is similar to FlowMutex except that lock holders can just drop the lock to release it. // The interface is similar to FlowMutex except that lock holders can just drop the lock to release it.
// //
// Usage: // Usage:
// Lock lock = wait(prioritylock.lock(priorityLevel)); // Lock lock = wait(prioritylock.lock(priority_id));
// lock.release(); // Explicit release, or // lock.release(); // Explicit release, or
// // let lock and all copies of lock go out of scope to release // // let lock and all copies of lock go out of scope to release
class PriorityMultiLock { class PriorityMultiLock : public ReferenceCounted<PriorityMultiLock> {
public: public:
// Waiting on the lock returns a Lock, which is really just a Promise<Void> // Waiting on the lock returns a Lock, which is really just a Promise<Void>
// Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release // Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release
// the Lock before it goes out of scope. // the Lock before it goes out of scope.
struct Lock { struct Lock {
void release() { promise.send(Void()); } void release() { promise.send(Void()); }
bool isLocked() const { return promise.canBeSet(); }
// This is exposed in case the caller wants to use/copy it directly // This is exposed in case the caller wants to use/copy it directly
Promise<Void> promise; Promise<Void> promise;
@ -84,10 +88,11 @@ public:
: PriorityMultiLock(concurrency, parseStringToVector<int>(weights, ',')) {} : PriorityMultiLock(concurrency, parseStringToVector<int>(weights, ',')) {}
PriorityMultiLock(int concurrency, std::vector<int> weightsByPriority) PriorityMultiLock(int concurrency, std::vector<int> weightsByPriority)
: concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0), releaseDebugID(0) { : concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0) {
priorities.resize(weightsByPriority.size()); priorities.resize(weightsByPriority.size());
for (int i = 0; i < priorities.size(); ++i) { for (int i = 0; i < priorities.size(); ++i) {
priorities[i].priority = i;
priorities[i].weight = weightsByPriority[i]; priorities[i].weight = weightsByPriority[i];
} }
@ -102,7 +107,8 @@ public:
// If this priority currently has no waiters // If this priority currently has no waiters
if (q.empty()) { if (q.empty()) {
// Add this priority's weight to the total for priorities with pending work // Add this priority's weight to the total for priorities with pending work. This must be done
// so that currenctCapacity() below will assign capacaity to this priority.
totalPendingWeights += p.weight; totalPendingWeights += p.weight;
// If there are slots available and the priority has capacity then don't make the caller wait // If there are slots available and the priority has capacity then don't make the caller wait
@ -114,80 +120,69 @@ public:
Lock lock; Lock lock;
addRunner(lock, &p); addRunner(lock, &p);
pml_debug_printf("lock nowait line %d priority %d %s\n", __LINE__, priority, toString().c_str()); pml_debug_printf("lock nowait priority %d %s\n", priority, toString().c_str());
return lock; return lock;
} }
// If we didn't return above then add the priority to the waitingPriorities list
waitingPriorities.push_back(p);
} }
Waiter w; Waiter& w = q.emplace_back();
q.push_back(w);
++waiting; ++waiting;
pml_debug_printf("lock wait line %d priority %d %s\n", __LINE__, priority, toString().c_str()); pml_debug_printf("lock wait priority %d %s\n", priority, toString().c_str());
return w.lockPromise.getFuture(); return w.lockPromise.getFuture();
} }
void kill() { void kill() {
pml_debug_printf("kill %s\n", toString().c_str());
brokenOnDestruct.reset(); brokenOnDestruct.reset();
// handleRelease will not free up any execution slots when it ends via cancel // handleRelease will not free up any execution slots when it ends via cancel
fRunner.cancel(); fRunner.cancel();
available = 0; available = 0;
runners.clear();
waitingPriorities.clear();
priorities.clear(); priorities.clear();
} }
std::string toString() const { std::string toString() const {
int runnersDone = 0; std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d "
for (int i = 0; i < runners.size(); ++i) { "pendingWeights=%d ",
if (runners[i].isReady()) {
++runnersDone;
}
}
std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d "
"runnersDone=%d pendingWeights=%d ",
this, this,
concurrency, concurrency,
available, available,
concurrency - available, concurrency - available,
waiting, waiting,
runners.size(),
runnersDone,
totalPendingWeights); totalPendingWeights);
for (int i = 0; i < priorities.size(); ++i) { for (auto& p : priorities) {
s += format("p%d:{%s} ", i, priorities[i].toString(this).c_str()); s += format("{%s} ", p.toString(this).c_str());
} }
s += "}"; s += "}";
if (concurrency - available != runners.size() - runnersDone) {
pml_debug_printf("%s\n", s.c_str());
ASSERT_EQ(concurrency - available, runners.size() - runnersDone);
}
return s; return s;
} }
int maxPriority() const { return priorities.size() - 1; } int maxPriority() const { return priorities.size() - 1; }
int totalWaiters() const { return waiting; } int getRunnersCount() const { return concurrency - available; }
int getWaitersCount() const { return waiting; }
int numWaiters(const unsigned int priority) const { int getWaitersCount(const unsigned int priority) const {
ASSERT(priority < priorities.size()); ASSERT(priority < priorities.size());
return priorities[priority].queue.size(); return priorities[priority].queue.size();
} }
int totalRunners() const { return concurrency - available; } int getRunnersCount(const unsigned int priority) const {
int numRunners(const unsigned int priority) const {
ASSERT(priority < priorities.size()); ASSERT(priority < priorities.size());
return priorities[priority].runners; return priorities[priority].runners;
} }
private: private:
struct Waiter { struct Waiter {
Waiter() {}
Promise<Lock> lockPromise; Promise<Lock> lockPromise;
}; };
@ -202,8 +197,8 @@ private:
typedef Deque<Waiter> Queue; typedef Deque<Waiter> Queue;
struct Priority { struct Priority : boost::intrusive::list_base_hook<> {
Priority() : runners(0), weight(0) {} Priority() : runners(0), weight(0), priority(-1) {}
// Queue of waiters at this priority // Queue of waiters at this priority
Queue queue; Queue queue;
@ -211,9 +206,12 @@ private:
int runners; int runners;
// Configured weight for this priority // Configured weight for this priority
int weight; int weight;
// Priority number for convenience, matches *this's index in PML priorities vector
int priority;
std::string toString(const PriorityMultiLock* pml) const { std::string toString(const PriorityMultiLock* pml) const {
return format("weight=%d run=%d wait=%d cap=%d", return format("priority=%d weight=%d run=%d wait=%d cap=%d",
priority,
weight, weight,
runners, runners,
queue.size(), queue.size(),
@ -222,51 +220,41 @@ private:
}; };
std::vector<Priority> priorities; std::vector<Priority> priorities;
typedef boost::intrusive::list<Priority, boost::intrusive::constant_time_size<false>> WaitingPrioritiesList;
// Current or recent (ended) runners // List of all priorities with 1 or more waiters. This list exists so that the scheduling loop
Deque<Future<Void>> runners; // does not have to iterage over the priorities vector checking priorities without waiters.
WaitingPrioritiesList waitingPriorities;
Future<Void> fRunner; Future<Void> fRunner;
AsyncTrigger wakeRunner; AsyncTrigger wakeRunner;
Promise<Void> brokenOnDestruct; Promise<Void> brokenOnDestruct;
// Used for debugging, can roll over without issue ACTOR static void handleRelease(Reference<PriorityMultiLock> self, Priority* priority, Future<Void> holder) {
unsigned int releaseDebugID; pml_debug_printf("%f handleRelease self=%p start\n", now(), self.getPtr());
ACTOR static Future<Void> handleRelease(PriorityMultiLock* self, Future<Void> f, Priority* priority) {
state [[maybe_unused]] unsigned int id = self->releaseDebugID++;
pml_debug_printf("%f handleRelease self=%p id=%u start \n", now(), self, id);
try { try {
wait(f); wait(holder);
pml_debug_printf("%f handleRelease self=%p id=%u success\n", now(), self, id); pml_debug_printf("%f handleRelease self=%p success\n", now(), self.getPtr());
} catch (Error& e) { } catch (Error& e) {
pml_debug_printf("%f handleRelease self=%p id=%u error %s\n", now(), self, id, e.what()); pml_debug_printf("%f handleRelease self=%p error %s\n", now(), self.getPtr(), e.what());
if (e.code() == error_code_actor_cancelled) {
throw;
}
} }
pml_debug_printf("lock release line %d priority %d %s\n", pml_debug_printf("lock release priority %d %s\n", (int)(priority->priority), self->toString().c_str());
__LINE__,
(int)(priority - &self->priorities.front()),
self->toString().c_str());
pml_debug_printf("%f handleRelease self=%p id=%u releasing\n", now(), self, id); pml_debug_printf("%f handleRelease self=%p releasing\n", now(), self.getPtr());
++self->available; ++self->available;
priority->runners -= 1; priority->runners -= 1;
// If there are any waiters or if the runners array is getting large, trigger the runner loop // If there are any waiters or if the runners array is getting large, trigger the runner loop
if (self->waiting > 0 || self->runners.size() > 1000) { if (self->waiting > 0) {
self->wakeRunner.trigger(); self->wakeRunner.trigger();
} }
return Void();
} }
void addRunner(Lock& lock, Priority* p) { void addRunner(Lock& lock, Priority* priority) {
p->runners += 1; priority->runners += 1;
--available; --available;
runners.push_back(handleRelease(this, lock.promise.getFuture(), p)); handleRelease(Reference<PriorityMultiLock>::addRef(this), priority, lock.promise.getFuture());
} }
// Current maximum running tasks for the specified priority, which must have waiters // Current maximum running tasks for the specified priority, which must have waiters
@ -278,76 +266,50 @@ private:
} }
ACTOR static Future<Void> runner(PriorityMultiLock* self) { ACTOR static Future<Void> runner(PriorityMultiLock* self) {
state int sinceYield = 0;
state Future<Void> error = self->brokenOnDestruct.getFuture(); state Future<Void> error = self->brokenOnDestruct.getFuture();
// Priority to try to run tasks from next // Priority to try to run tasks from next
state int priority = 0; state WaitingPrioritiesList::iterator p = self->waitingPriorities.end();
loop { loop {
pml_debug_printf( pml_debug_printf("runner loop start priority=%d %s\n", p->priority, self->toString().c_str());
"runner loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
// Cleanup finished runner futures at the front of the runner queue.
while (!self->runners.empty() && self->runners.front().isReady()) {
self->runners.pop_front();
}
// Wait for a runner to release its lock // Wait for a runner to release its lock
pml_debug_printf( pml_debug_printf("runner loop waitTrigger priority=%d %s\n", p->priority, self->toString().c_str());
"runner loop waitTrigger line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
wait(self->wakeRunner.onTrigger()); wait(self->wakeRunner.onTrigger());
pml_debug_printf( pml_debug_printf("%f runner loop wake priority=%d %s\n", now(), p->priority, self->toString().c_str());
"%f runner loop wake line %d priority=%d %s\n", now(), __LINE__, priority, self->toString().c_str());
if (++sinceYield == 100) {
sinceYield = 0;
pml_debug_printf(
" runner waitDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
wait(delay(0));
pml_debug_printf(
" runner afterDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
}
// While there are available slots and there are waiters, launch tasks // While there are available slots and there are waiters, launch tasks
while (self->available > 0 && self->waiting > 0) { while (self->available > 0 && self->waiting > 0) {
pml_debug_printf( pml_debug_printf(" launch loop start priority=%d %s\n", p->priority, self->toString().c_str());
" launch loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
Priority* pPriority;
// Find the next priority with waiters and capacity. There must be at least one. // Find the next priority with waiters and capacity. There must be at least one.
loop { loop {
// Rotate to next priority if (p == self->waitingPriorities.end()) {
if (++priority == self->priorities.size()) { p = self->waitingPriorities.begin();
priority = 0;
} }
pPriority = &self->priorities[priority]; pml_debug_printf(" launch loop scan priority=%d %s\n", p->priority, self->toString().c_str());
pml_debug_printf(" launch loop scan line %d priority=%d %s\n", if (!p->queue.empty() && p->runners < self->currentCapacity(p->weight)) {
__LINE__,
priority,
self->toString().c_str());
if (!pPriority->queue.empty() && pPriority->runners < self->currentCapacity(pPriority->weight)) {
break; break;
} }
++p;
} }
Queue& queue = pPriority->queue; Queue& queue = p->queue;
Waiter w = queue.front(); Waiter w = queue.front();
queue.pop_front(); queue.pop_front();
// If this priority is now empty, subtract its weight from the total pending weights // If this priority is now empty, subtract its weight from the total pending weights an remove it
// from the waitingPriorities list
Priority* pPriority = &*p;
if (queue.empty()) { if (queue.empty()) {
p = self->waitingPriorities.erase(p);
self->totalPendingWeights -= pPriority->weight; self->totalPendingWeights -= pPriority->weight;
pml_debug_printf(" emptied priority line %d priority=%d %s\n", pml_debug_printf(
__LINE__, " emptied priority priority=%d %s\n", pPriority->priority, self->toString().c_str());
priority,
self->toString().c_str());
} }
--self->waiting; --self->waiting;
@ -365,10 +327,9 @@ private:
self->addRunner(lock, pPriority); self->addRunner(lock, pPriority);
} }
pml_debug_printf(" launched line %d alreadyDone=%d priority=%d %s\n", pml_debug_printf(" launched alreadyDone=%d priority=%d %s\n",
__LINE__,
!lock.promise.canBeSet(), !lock.promise.canBeSet(),
priority, pPriority->priority,
self->toString().c_str()); self->toString().c_str());
} }
} }

View File

@ -0,0 +1,180 @@
/*
* BenchBlobDeltaFiles.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "benchmark/benchmark.h"

#include "fdbclient/BlobGranuleFiles.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "flow/DeterministicRandom.h"
#include "flow/IRandom.h"
#include "flow/flow.h"

#include <cstdlib>
#include <iterator>
#include <map>
#include <set>
#include <stdexcept>
#include <string>
// Pre-generated GranuleDelta size in bytes for benchmark.
const static int PRE_GEN_TARGET_BYTES[] = { 128 * 1024, 512 * 1024, 1024 * 1024 };
// Generate GranuleDelta using a deterministic way. Change the seed if you would test a new data set
class DeltaGenerator {
public:
	// Deterministically builds a key range, version-jump / value-size parameters, and one
	// pre-generated GranuleDeltas data set per size in PRE_GEN_TARGET_BYTES.
	// Same seed => same data set, so benchmark runs are reproducible.
	DeltaGenerator(uint32_t seed = 12345678) {
		randGen = Reference<IRandom>(new DeterministicRandom(seed));
		// Generate key range
		prefix = StringRef(ar, randGen->randomUniqueID().toString() + "_");
		range = KeyRangeRef(prefix, StringRef(ar, strinc(prefix)));
		// Generate version jump size
		minVersionJump = randGen->randomExp(0, 25);
		maxVersionJump = minVersionJump + randGen->randomExp(0, 25);
		// Generate value size range
		maxValueSize = randGen->randomExp(7, 9);
		// Generate start version
		version = randGen->randomUInt32();
		// Generate probability of update existing keys
		updateExistingKeysProb = randGen->random01();
		// Generate deltas
		for (auto i : PRE_GEN_TARGET_BYTES) {
			genDeltas(i);
		}

		fmt::print("key range: {} - {}\n", range.begin.printable(), range.end.printable());
		fmt::print("start version: {}\n", version);
		fmt::print("max value bytes: {}\n", maxValueSize);
		fmt::print("version jump range: {} - {}\n", minVersionJump, maxVersionJump);
		fmt::print("probability for update: {}\n", updateExistingKeysProb);
		fmt::print("unseed: {}\n", randGen->randomUInt32());
	}

	KeyRange getRange() { return range; }

	// Returns the pre-generated delta set for the given target size; throws for any size
	// not listed in PRE_GEN_TARGET_BYTES.
	Standalone<GranuleDeltas> getDelta(int targetBytes) {
		// Single lookup instead of find() followed by operator[]
		auto it = deltas.find(targetBytes);
		if (it != deltas.end()) {
			return it->second;
		}
		throw std::invalid_argument("Test delta file size is not pre-generated!");
	}

private:
	// Appends freshly generated deltas until the accumulated payload reaches targetBytes.
	void genDeltas(int targetBytes) {
		Standalone<GranuleDeltas> data;
		int totalDataBytes = 0;
		while (totalDataBytes < targetBytes) {
			data.push_back(ar, newDelta());
			totalDataBytes += data.back().expectedSize();
		}
		deltas[targetBytes] = data;
	}

	MutationRef newMutation() { return MutationRef(ar, MutationRef::SetValue, key(), value()); }

	// A batch of 10 SetValue mutations at a new, randomly advanced version.
	MutationsAndVersionRef newDelta() {
		version += randGen->randomInt(minVersionJump, maxVersionJump);
		MutationsAndVersionRef ret(version, version);
		for (int i = 0; i < 10; i++) {
			ret.mutations.push_back_deep(ar, newMutation());
		}
		return ret;
	}

	// With probability updateExistingKeysProb re-uses an already generated key,
	// otherwise mints a fresh prefixed key.
	StringRef key() {
		// Pick an existing key
		if (randGen->random01() < updateExistingKeysProb && !usedKeys.empty()) {
			int r = randGen->randomUInt32() % usedKeys.size();
			auto it = usedKeys.begin();
			std::advance(it, r); // idiomatic iterator walk instead of a manual increment loop
			return StringRef(ar, *it);
		}

		// Create a new key
		std::string key = prefix.toString() + randGen->randomUniqueID().toString();
		usedKeys.insert(key);
		return StringRef(ar, key);
	}

	// Random-length value, padded with a repeated character so it stays compressible.
	StringRef value() {
		int valueSize = randGen->randomInt(maxValueSize / 2, maxValueSize * 3 / 2);
		std::string value = randGen->randomUniqueID().toString();
		// Explicit casts avoid signed/unsigned comparison between size_t and int
		if (static_cast<int>(value.size()) > valueSize) {
			value = value.substr(0, valueSize);
		}
		if (static_cast<int>(value.size()) < valueSize) {
			// repeated string so it's compressible
			value += std::string(valueSize - value.size(), 'x');
		}
		return StringRef(ar, value);
	}

	Reference<IRandom> randGen; // deterministic RNG seeded in the constructor
	Arena ar; // owns all generated key/value/mutation memory
	KeyRangeRef range;
	Key prefix;
	int maxValueSize;
	Version version;
	int minVersionJump;
	int maxVersionJump;
	std::set<std::string> usedKeys;
	double updateExistingKeysProb;
	std::map<int, Standalone<GranuleDeltas>> deltas;
};
static DeltaGenerator deltaGen; // Pre-generate deltas
// Benchmark serialization without compression/encryption. The main CPU cost should be sortDeltasByKey
static void bench_serialize_deltas(benchmark::State& state) {
	// Arg0: pre-generated delta set size in bytes; Arg1: serialization chunk size in bytes
	int targetBytes = state.range(0);
	int chunkSize = state.range(1);

	Standalone<GranuleDeltas> delta = deltaGen.getDelta(targetBytes);
	KeyRange range = deltaGen.getRange();
	Standalone<StringRef> fileName = "testdelta"_sr; // unused
	Optional<CompressionFilter> compressFilter; // unused. no compression
	Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx; // unused. no encryption

	// uint64_t: a uint32_t total overflows after ~4K iterations of a 1MB serialization
	uint64_t serializedBytes = 0;
	for (auto _ : state) {
		Value serialized = serializeChunkedDeltaFile(fileName, delta, range, chunkSize, compressFilter, cipherKeysCtx);
		serializedBytes += serialized.size();
	}
	state.SetBytesProcessed(static_cast<long>(state.iterations()) * targetBytes);
	state.counters["serialized_bytes"] = serializedBytes;
}
// Benchmark sorting deltas
static void bench_sort_deltas(benchmark::State& state) {
	// Arg0 selects one of the pre-generated delta sets by its target byte size.
	const int deltaBytes = state.range(0);
	Standalone<GranuleDeltas> granuleDeltas = deltaGen.getDelta(deltaBytes);
	KeyRange keyRange = deltaGen.getRange();

	// Time only the sort of the deltas within the granule's key range.
	for (auto _ : state) {
		sortDeltasByKey(granuleDeltas, keyRange);
	}
	state.SetBytesProcessed(static_cast<long>(state.iterations()) * deltaBytes);
}
// Benchmark serialization for granule deltas 128KB, 512KB and 1024KB. Chunk size 32KB
BENCHMARK(bench_serialize_deltas)
    ->Args({ 128 * 1024, 32 * 1024 })
    ->Args({ 512 * 1024, 32 * 1024 })
    ->Args({ 1024 * 1024, 32 * 1024 });
// Benchmark sorting for granule deltas 128KB, 512KB and 1024KB (no chunk-size argument)
BENCHMARK(bench_sort_deltas)->Args({ 128 * 1024 })->Args({ 512 * 1024 })->Args({ 1024 * 1024 });

View File

@ -25,26 +25,28 @@
#include "flow/PriorityMultiLock.actor.h" #include "flow/PriorityMultiLock.actor.h"
#include <deque> #include <deque>
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
#include "fmt/printf.h"
ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) { ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
state std::vector<int> priorities; // Arg1 is the number of active priorities to use
// Arg2 is the number of inactive priorities to use
state int active = benchState->range(0);
state int inactive = benchState->range(1);
// Set up priority list with limits 10, 20, 30, ... // Set up priority list with limits 10, 20, 30, ...
while (priorities.size() < benchState->range(0)) { state std::vector<int> priorities;
while (priorities.size() < active + inactive) {
priorities.push_back(10 * (priorities.size() + 1)); priorities.push_back(10 * (priorities.size() + 1));
} }
state int concurrency = priorities.size() * 10; state int concurrency = priorities.size() * 10;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities); state Reference<PriorityMultiLock> pml = makeReference<PriorityMultiLock>(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
// Clog the lock buy taking concurrency locks // Clog the lock buy taking n=concurrency locks
state std::deque<Future<PriorityMultiLock::Lock>> lockFutures; state std::deque<Future<PriorityMultiLock::Lock>> lockFutures;
for (int j = 0; j < concurrency; ++j) { for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(j % priorities.size())); lockFutures.push_back(pml->lock(j % active));
} }
// Wait for all of the initial locks to be taken // Wait for all of the initial locks to be taken
// This will work regardless of their priorities as there are only n = concurrency of them // This will work regardless of their priorities as there are only n = concurrency of them
wait(waitForAll(std::vector<Future<PriorityMultiLock::Lock>>(lockFutures.begin(), lockFutures.end()))); wait(waitForAll(std::vector<Future<PriorityMultiLock::Lock>>(lockFutures.begin(), lockFutures.end())));
@ -64,7 +66,7 @@ ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
PriorityMultiLock::Lock lock = wait(f); PriorityMultiLock::Lock lock = wait(f);
// Rotate to another priority // Rotate to another priority
if (++p == priorities.size()) { if (++p == active) {
p = 0; p = 0;
} }
@ -76,7 +78,6 @@ ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
benchState->SetItemsProcessed(static_cast<long>(benchState->iterations())); benchState->SetItemsProcessed(static_cast<long>(benchState->iterations()));
delete pml;
return Void(); return Void();
} }
@ -84,4 +85,4 @@ static void bench_priorityMultiLock(benchmark::State& benchState) {
onMainThread([&benchState]() { return benchPriorityMultiLock(&benchState); }).blockUntilReady(); onMainThread([&benchState]() { return benchPriorityMultiLock(&benchState); }).blockUntilReady();
} }
BENCHMARK(bench_priorityMultiLock)->DenseRange(1, 8)->ReportAggregatesOnly(true); BENCHMARK(bench_priorityMultiLock)->Args({ 5, 0 })->Ranges({ { 1, 64 }, { 0, 128 } })->ReportAggregatesOnly(true);

View File

@ -334,9 +334,6 @@ logdir = {logdir}
db_config += " blob_granules_enabled:=1" db_config += " blob_granules_enabled:=1"
self.fdbcli_exec(db_config) self.fdbcli_exec(db_config)
if self.blob_granules_enabled:
self.fdbcli_exec("blobrange start \\x00 \\xff")
# Generate and install test certificate chains and keys # Generate and install test certificate chains and keys
def create_tls_cert(self): def create_tls_cert(self):
assert self.tls_config is not None, "TLS not enabled" assert self.tls_config is not None, "TLS not enabled"

View File

@ -6,6 +6,7 @@ enable_encryption = true
enable_tlog_encryption = true enable_tlog_encryption = true
enable_storage_server_encryption = false enable_storage_server_encryption = false
enable_blob_granule_encryption = true enable_blob_granule_encryption = true
max_write_transaction_life_versions = 5000000
[[test]] [[test]]
testTitle = 'EncryptedBackupAndRestore' testTitle = 'EncryptedBackupAndRestore'

View File

@ -8,20 +8,36 @@ testTitle = 'TenantCreation'
[[test.workload]] [[test.workload]]
testName = 'CreateTenant' testName = 'CreateTenant'
name = 'First' name = 'First'
group = 'GroupA'
[[test.workload]] [[test.workload]]
testName = 'CreateTenant' testName = 'CreateTenant'
name = 'Second' name = 'Second'
group = 'GroupA'
[[test.workload]]
testName = 'CreateTenant'
name = 'Third'
group = 'GroupB'
[[test.workload]]
testName = 'CreateTenant'
name = 'Fourth'
group = 'GroupB'
[[test]] [[test]]
testTitle = 'StorageQuota' testTitle = 'StorageQuota'
[[test.workload]] [[test.workload]]
testName = 'StorageQuota' testName = 'StorageQuota'
group = 'GroupA'
tenant = 'First' tenant = 'First'
nodeCount = 250000 nodeCount = 250000
emptyTenant = 'Second'
[[test.workload]] [[test.workload]]
testName = 'StorageQuota' testName = 'StorageQuota'
tenant = 'Second' group = 'GroupB'
tenant = 'Third'
nodeCount = 25000 nodeCount = 25000
emptyTenant = 'Fourth'