solve merge conflict upstream/main

Xiaoxi Wang 2022-11-15 14:35:47 -08:00
commit 907d7af966
85 changed files with 1892 additions and 861 deletions


@ -70,10 +70,13 @@ void ApiWorkload::start() {
schedule([this]() {
// 1. Clear data
clearData([this]() {
// 2. Populate initial data
populateData([this]() {
// 3. Generate random workload
runTests();
// 2. Workload setup
setup([this]() {
// 3. Populate initial data
populateData([this]() {
// 4. Generate random workload
runTests();
});
});
});
});
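The nested continuations above are hard to read in diff form; as a minimal standalone sketch (names assumed, not the tester's real scheduler), each phase simply receives the next phase as a callback, so the workload only starts once every earlier phase has finished:

#include <cstdio>
#include <functional>

using TTaskFct = std::function<void()>;

// Each phase runs, then hands control to its continuation.
void phase(const char* name, TTaskFct cont) {
    std::printf("%s done\n", name);
    cont();
}

int main() {
    phase("clear", [] {
        phase("setup", [] {
            phase("populate", [] { std::printf("running tests\n"); });
        });
    });
}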
@ -249,6 +252,10 @@ void ApiWorkload::populateData(TTaskFct cont) {
}
}
void ApiWorkload::setup(TTaskFct cont) {
schedule(cont);
}
void ApiWorkload::randomInsertOp(TTaskFct cont, std::optional<int> tenantId) {
int numKeys = Random::get().randomInt(1, maxKeysPerTransaction);
auto kvPairs = std::make_shared<std::vector<fdb::KeyValue>>();
@ -322,4 +329,85 @@ std::optional<fdb::BytesRef> ApiWorkload::getTenant(std::optional<int> tenantId)
}
}
std::string ApiWorkload::debugTenantStr(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format("(tenant {0})", tenantId.value()) : "()";
}
// BlobGranule setup.
// This blobbifies ['\x00', '\xff') per tenant or for the whole database if there are no tenants.
void ApiWorkload::setupBlobGranules(TTaskFct cont) {
// This count is used to synchronize the # of tenant blobbifyRange() calls to ensure
// we only start the workload once blobbification has fully finished.
auto blobbifiedCount = std::make_shared<std::atomic<int>>(1);
if (tenants.empty()) {
blobbifiedCount->store(1);
blobbifyTenant({}, blobbifiedCount, cont);
} else {
blobbifiedCount->store(tenants.size());
for (int i = 0; i < tenants.size(); i++) {
schedule([=]() { blobbifyTenant(i, blobbifiedCount, cont); });
}
}
}
void ApiWorkload::blobbifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retBlobbifyRange = std::make_shared<bool>(false);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: blobbifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->blobbifyRange(begin, end).eraseType();
ctx->continueAfter(f, [ctx, retBlobbifyRange, f]() {
*retBlobbifyRange = f.get<fdb::future_var::Bool>();
ctx->done();
});
},
[=]() {
if (!*retBlobbifyRange) {
schedule([=]() { blobbifyTenant(tenantId, blobbifiedCount, cont); });
} else {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
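Since blobbifyRange() can return false before blob ranges are ready, blobbifyTenant() simply reschedules itself until it succeeds. A standalone sketch of that reschedule-until-success shape (the step/schedule names here are assumptions, not the tester API):

#include <functional>

// Re-arm a transiently failing step until it succeeds, then continue.
void retryUntil(std::function<bool()> step,
                std::function<void(std::function<void()>)> schedule,
                std::function<void()> next) {
    schedule([=]() {
        if (step()) {
            next(); // success: move on to the next stage
        } else {
            retryUntil(step, schedule, next); // transient failure: try again
        }
    });
}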
void ApiWorkload::verifyTenant(std::optional<int> tenantId,
std::shared_ptr<std::atomic<int>> blobbifiedCount,
TTaskFct cont) {
auto retVerifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[=](auto ctx) {
fdb::Key begin(1, '\x00');
fdb::Key end(1, '\xff');
info(fmt::format("setup: verifying {}: [\\x00 - \\xff)\n", debugTenantStr(tenantId)));
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, /*latest_version*/ -2).eraseType();
ctx->continueAfter(f, [ctx, retVerifyVersion, f]() {
*retVerifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
});
},
[=]() {
if (*retVerifyVersion == -1) {
schedule([=]() { verifyTenant(tenantId, blobbifiedCount, cont); });
} else {
if (blobbifiedCount->fetch_sub(1) == 1) {
schedule(cont);
}
}
},
/*tenant=*/getTenant(tenantId),
/* failOnError = */ false);
}
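The blobbifiedCount counter acts as a countdown latch across tenants: only the verification that brings it to zero schedules the continuation. Reduced to its essentials (a sketch, not the workload code):

#include <atomic>
#include <functional>
#include <memory>

// The last tenant to finish (fetch_sub returning 1) fires cont exactly once.
void onTenantVerified(std::shared_ptr<std::atomic<int>> remaining, std::function<void()> cont) {
    if (remaining->fetch_sub(1) == 1) {
        cont();
    }
}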
} // namespace FdbApiTester


@ -41,6 +41,9 @@ public:
virtual void checkProgress() override;
// Workload specific setup phase.
virtual void setup(TTaskFct cont);
// Running specific tests
// The default implementation generates a workload consisting of
// random operations generated by randomOperation
@ -126,6 +129,12 @@ protected:
void randomClearRangeOp(TTaskFct cont, std::optional<int> tenantId);
std::optional<fdb::BytesRef> getTenant(std::optional<int> tenantId);
std::string debugTenantStr(std::optional<int> tenantId);
// Generic BlobGranules setup.
void setupBlobGranules(TTaskFct cont);
void blobbifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
void verifyTenant(std::optional<int> tenantId, std::shared_ptr<std::atomic<int>> blobbifiedCount, TTaskFct cont);
private:
void populateDataTx(TTaskFct cont, std::optional<int> tenantId);


@ -52,26 +52,23 @@ private:
};
std::vector<OpType> excludedOpTypes;
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet
// FIXME: should still guarantee a read succeeds eventually somehow
// FIXME: this needs to be per tenant if tenant ids are set
std::unordered_set<std::optional<int>> tenantsWithReadSuccess;
inline void setReadSuccess(std::optional<int> tenantId) { tenantsWithReadSuccess.insert(tenantId); }
inline bool seenReadSuccess(std::optional<int> tenantId) { return tenantsWithReadSuccess.count(tenantId); }
std::string tenantDebugString(std::optional<int> tenantId) {
return tenantId.has_value() ? fmt::format(" (tenant {0})", tenantId.value()) : "";
}
void debugOp(std::string opName, fdb::Key begin, fdb::Key end, std::optional<int> tenantId, std::string message) {
if (BG_API_DEBUG_VERBOSE) {
info(fmt::format("{0}: [{1} - {2}){3}: {4}",
info(fmt::format("{0}: [{1} - {2}) {3}: {4}",
opName,
fdb::toCharsRef(begin),
fdb::toCharsRef(end),
tenantDebugString(tenantId),
debugTenantStr(tenantId),
message));
}
}
@ -117,7 +114,7 @@ private:
results.get()->assign(resVector.begin(), resVector.end());
bool previousSuccess = seenReadSuccess(tenantId);
if (!previousSuccess) {
info(fmt::format("Read{0}: first success\n", tenantDebugString(tenantId)));
info(fmt::format("Read {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
} else {
debugOp("Read", begin, end, tenantId, "complete");
@ -289,20 +286,19 @@ private:
}
// TODO: tenant support
void randomGetBlobRangesOp(TTaskFct cont) {
void randomGetBlobRangesOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
auto results = std::make_shared<std::vector<fdb::KeyRange>>();
if (begin > end) {
std::swap(begin, end);
}
std::optional<int> tenantId = {};
debugOp("GetBlobRanges", begin, end, tenantId, "starting");
execOperation(
[begin, end, results](auto ctx) {
fdb::Future f = ctx->db().listBlobbifiedRanges(begin, end, 1000).eraseType();
fdb::Future f = ctx->dbOps()->listBlobbifiedRanges(begin, end, 1000).eraseType();
ctx->continueAfter(f, [ctx, f, results]() {
*results = copyKeyRangeArray(f.get<fdb::future_var::KeyRangeRefArray>());
ctx->done();
@ -314,25 +310,24 @@ private:
this->validateRanges(results, begin, end, seenReadSuccess(tenantId));
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
// TODO: tenant support
void randomVerifyOp(TTaskFct cont) {
void randomVerifyOp(TTaskFct cont, std::optional<int> tenantId) {
fdb::Key begin = randomKeyName();
fdb::Key end = randomKeyName();
std::optional<int> tenantId;
if (begin > end) {
std::swap(begin, end);
}
auto verifyVersion = std::make_shared<int64_t>(false);
debugOp("Verify", begin, end, tenantId, "starting");
auto verifyVersion = std::make_shared<int64_t>(-1);
execOperation(
[begin, end, verifyVersion](auto ctx) {
fdb::Future f = ctx->db().verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
fdb::Future f = ctx->dbOps()->verifyBlobRange(begin, end, -2 /* latest version*/).eraseType();
ctx->continueAfter(f, [ctx, verifyVersion, f]() {
*verifyVersion = f.get<fdb::future_var::Int64>();
ctx->done();
@ -344,15 +339,16 @@ private:
if (*verifyVersion == -1) {
ASSERT(!previousSuccess);
} else if (!previousSuccess) {
info(fmt::format("Verify{0}: first success\n", tenantDebugString(tenantId)));
info(fmt::format("Verify {0}: first success\n", debugTenantStr(tenantId)));
setReadSuccess(tenantId);
}
schedule(cont);
},
getTenant(tenantId),
/* failOnError = */ false);
}
void randomOperation(TTaskFct cont) {
void randomOperation(TTaskFct cont) override {
std::optional<int> tenantId = randomTenant();
OpType txType = (stores[tenantId].size() == 0) ? OP_INSERT : (OpType)Random::get().randomInt(0, OP_LAST);
@ -380,10 +376,10 @@ private:
randomSummarizeOp(cont, tenantId);
break;
case OP_GET_BLOB_RANGES:
randomGetBlobRangesOp(cont);
randomGetBlobRangesOp(cont, tenantId);
break;
case OP_VERIFY:
randomVerifyOp(cont);
randomVerifyOp(cont, tenantId);
break;
}
}


@ -47,6 +47,8 @@ private:
OP_LAST = OP_CANCEL_PURGE
};
void setup(TTaskFct cont) override { setupBlobGranules(cont); }
// could add summarize too old and verify too old as ops if desired but those are lower value
// Allow reads at the start to get blob_granule_transaction_too_old if BG data isn't initialized yet


@ -91,13 +91,15 @@ public:
fdbDb = executor->selectDatabase();
}
if (tenantName) {
fdbTenant = fdbDb.openTenant(*tenantName);
fdbDbOps = std::make_shared<fdb::Tenant>(fdbTenant);
} else {
fdbDbOps = std::make_shared<fdb::Database>(fdbDb);
}
if (transactional) {
if (tenantName) {
fdb::Tenant tenant = fdbDb.openTenant(*tenantName);
fdbTx = tenant.createTransaction();
} else {
fdbTx = fdbDb.createTransaction();
}
fdbTx = fdbDbOps->createTransaction();
}
}
@ -109,6 +111,10 @@ public:
fdb::Database db() override { return fdbDb.atomic_load(); }
fdb::Tenant tenant() override { return fdbTenant.atomic_load(); }
std::shared_ptr<fdb::IDatabaseOps> dbOps() override { return std::atomic_load(&fdbDbOps); }
fdb::Transaction tx() override { return fdbTx.atomic_load(); }
// Set a continuation to be executed when a future gets ready
@ -272,13 +278,17 @@ protected:
scheduler->schedule([thisRef]() {
fdb::Database db = thisRef->executor->selectDatabase();
thisRef->fdbDb.atomic_store(db);
if (thisRef->tenantName) {
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTenant.atomic_store(tenant);
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Tenant>(tenant)));
} else {
std::atomic_store(&thisRef->fdbDbOps,
std::dynamic_pointer_cast<fdb::IDatabaseOps>(std::make_shared<fdb::Database>(db)));
}
if (thisRef->transactional) {
if (thisRef->tenantName) {
fdb::Tenant tenant = db.openTenant(*thisRef->tenantName);
thisRef->fdbTx.atomic_store(tenant.createTransaction());
} else {
thisRef->fdbTx.atomic_store(db.createTransaction());
}
thisRef->fdbTx.atomic_store(thisRef->fdbDbOps->createTransaction());
}
thisRef->restartTransaction();
});
@ -317,6 +327,14 @@ protected:
// Provides a thread safe interface by itself (no need for mutex)
fdb::Database fdbDb;
// FDB tenant
// Provides a thread safe interface by itself (no need for mutex)
fdb::Tenant fdbTenant;
// FDB IDatabaseOps to hide database/tenant accordingly.
// Provides a shared pointer to database functions based on if db or tenant.
std::shared_ptr<fdb::IDatabaseOps> fdbDbOps;
// FDB transaction
// Provides a thread safe interface by itself (no need for mutex)
fdb::Transaction fdbTx;


@ -41,6 +41,12 @@ public:
// Current FDB database
virtual fdb::Database db() = 0;
// Current FDB tenant
virtual fdb::Tenant tenant() = 0;
// Current FDB IDatabaseOps
virtual std::shared_ptr<fdb::IDatabaseOps> dbOps() = 0;
// Current FDB transaction
virtual fdb::Transaction tx() = 0;


@ -117,8 +117,11 @@ void WorkloadBase::execTransaction(TOpStartFct startFct,
}
// Execute a non-transactional database operation within the workload
void WorkloadBase::execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError) {
doExecute(startFct, cont, {}, failOnError, false);
void WorkloadBase::execOperation(TOpStartFct startFct,
TTaskFct cont,
std::optional<fdb::BytesRef> tenant,
bool failOnError) {
doExecute(startFct, cont, tenant, failOnError, false);
}
void WorkloadBase::doExecute(TOpStartFct startFct,


@ -125,7 +125,10 @@ protected:
bool failOnError = true);
// Execute a non-transactional database operation within the workload
void execOperation(TOpStartFct startFct, TTaskFct cont, bool failOnError = true);
void execOperation(TOpStartFct startFct,
TTaskFct cont,
std::optional<fdb::BytesRef> tenant = std::optional<fdb::BytesRef>(),
bool failOnError = true);
// Log an error message, increase error counter
void error(const std::string& msg);


@ -677,7 +677,28 @@ public:
}
};
class Tenant final {
// Use an abstract class rather than a pure interface to preserve the lifetime of the FDB objects owned by Tenant
// and Database.
class IDatabaseOps {
public:
virtual ~IDatabaseOps() = default;
virtual Transaction createTransaction() = 0;
virtual TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) = 0;
virtual TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin,
KeyRef end,
int rangeLimit) = 0;
virtual TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) = 0;
virtual TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin,
KeyRef end,
int64_t version,
bool force) = 0;
virtual TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) = 0;
};
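For illustration, a call site can then hold either a tenant or the whole database behind the same pointer; a usage sketch (helper name assumed, mirroring how the tester wires its fdbDbOps member):

#include <memory>
#include <optional>

// Pick the tenant or the whole database behind one IDatabaseOps pointer;
// every later call dispatches virtually to the right native API.
std::shared_ptr<fdb::IDatabaseOps> makeDbOps(fdb::Database db, std::optional<fdb::BytesRef> tenantName) {
    if (tenantName) {
        return std::make_shared<fdb::Tenant>(db.openTenant(*tenantName));
    }
    return std::make_shared<fdb::Database>(db);
}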
class Tenant final : public IDatabaseOps {
friend class Database;
std::shared_ptr<native::FDBTenant> tenant;
@ -694,6 +715,14 @@ public:
Tenant& operator=(const Tenant&) noexcept = default;
Tenant() noexcept : tenant(nullptr) {}
void atomic_store(Tenant other) { std::atomic_store(&tenant, other.tenant); }
Tenant atomic_load() {
Tenant retVal;
retVal.tenant = std::atomic_load(&tenant);
return retVal;
}
static void createTenant(Transaction tr, BytesRef name) {
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_SPECIAL_KEY_SPACE_ENABLE_WRITES, BytesRef());
tr.setOption(FDBTransactionOption::FDB_TR_OPTION_LOCK_AWARE, BytesRef());
@ -715,7 +744,7 @@ public:
return tr.get(toBytesRef(fmt::format("{}{}", tenantManagementMapPrefix, toCharsRef(name))), false);
}
Transaction createTransaction() {
Transaction createTransaction() override {
auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
auto err = Error(native::fdb_tenant_create_transaction(tenant.get(), &tx_native));
if (err)
@ -723,14 +752,49 @@ public:
return Transaction(tx_native);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant)
throw std::runtime_error("blobbifyRange from null tenant");
throw std::runtime_error("blobbifyRange() from null tenant");
return native::fdb_tenant_blobbify_range(tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!tenant)
throw std::runtime_error("unblobbifyRange() from null tenant");
return native::fdb_tenant_unblobbify_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!tenant)
throw std::runtime_error("listBlobbifiedRanges() from null tenant");
return native::fdb_tenant_list_blobbified_ranges(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
}
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!tenant)
throw std::runtime_error("verifyBlobRange() from null tenant");
return native::fdb_tenant_verify_blob_range(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
}
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!tenant)
throw std::runtime_error("purgeBlobGranules() from null tenant");
native::fdb_bool_t forceBool = force;
return native::fdb_tenant_purge_blob_granules(
tenant.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
}
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!tenant)
throw std::runtime_error("waitPurgeGranulesComplete() from null tenant");
return native::fdb_tenant_wait_purge_granules_complete(tenant.get(), purgeKey.data(), intSize(purgeKey));
}
};
class Database {
class Database : public IDatabaseOps {
friend class Tenant;
std::shared_ptr<native::FDBDatabase> db;
@ -789,7 +853,7 @@ public:
return Tenant(tenant_native);
}
Transaction createTransaction() {
Transaction createTransaction() override {
if (!db)
throw std::runtime_error("create_transaction from null database");
auto tx_native = static_cast<native::FDBTransaction*>(nullptr);
@ -799,33 +863,33 @@ public:
return Transaction(tx_native);
}
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) {
TypedFuture<future_var::KeyRangeRefArray> listBlobbifiedRanges(KeyRef begin, KeyRef end, int rangeLimit) override {
if (!db)
throw std::runtime_error("listBlobbifiedRanges from null database");
return native::fdb_database_list_blobbified_ranges(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), rangeLimit);
}
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) {
TypedFuture<future_var::Int64> verifyBlobRange(KeyRef begin, KeyRef end, int64_t version) override {
if (!db)
throw std::runtime_error("verifyBlobRange from null database");
return native::fdb_database_verify_blob_range(
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version);
}
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> blobbifyRange(KeyRef begin, KeyRef end) override {
if (!db)
throw std::runtime_error("blobbifyRange from null database");
return native::fdb_database_blobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) {
TypedFuture<future_var::Bool> unblobbifyRange(KeyRef begin, KeyRef end) override {
if (!db)
throw std::runtime_error("unblobbifyRange from null database");
return native::fdb_database_unblobbify_range(db.get(), begin.data(), intSize(begin), end.data(), intSize(end));
}
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) {
TypedFuture<future_var::KeyRef> purgeBlobGranules(KeyRef begin, KeyRef end, int64_t version, bool force) override {
if (!db)
throw std::runtime_error("purgeBlobGranules from null database");
native::fdb_bool_t forceBool = force;
@ -833,7 +897,7 @@ public:
db.get(), begin.data(), intSize(begin), end.data(), intSize(end), version, forceBool);
}
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) {
TypedFuture<future_var::None> waitPurgeGranulesComplete(KeyRef purgeKey) override {
if (!db)
throw std::runtime_error("purgeBlobGranules from null database");
return native::fdb_database_wait_purge_granules_complete(db.get(), purgeKey.data(), intSize(purgeKey));


@ -54,7 +54,7 @@ def write_coverage_chunk(tr, path: Tuple[str, ...], metadata: Tuple[str, ...],
initialized = v.present()
for cov, covered in coverage:
if not initialized or covered:
tr.add(cov_dir.pack((cov.file, cov.line, cov.comment)), struct.pack('<I', 1 if covered else 0))
tr.add(cov_dir.pack((cov.file, cov.line, cov.comment, cov.rare)), struct.pack('<I', 1 if covered else 0))
return initialized
@ -80,9 +80,9 @@ def _read_coverage(tr, cov_path: Tuple[str, ...]) -> OrderedDict[Coverage, int]:
res = collections.OrderedDict()
cov_dir = fdb.directory.create_or_open(tr, cov_path)
for k, v in tr[cov_dir.range()]:
file, line, comment = cov_dir.unpack(k)
file, line, comment, rare = cov_dir.unpack(k)
count = struct.unpack('<I', v)[0]
res[Coverage(file, line, comment)] = count
res[Coverage(file, line, comment, rare)] = count
return res


@ -19,6 +19,7 @@ class GlobalStatistics:
self.total_cpu_time: int = 0
self.total_test_runs: int = 0
self.total_missed_probes: int = 0
self.total_missed_nonrare_probes: int = 0
class EnsembleResults:
@ -40,6 +41,8 @@ class EnsembleResults:
self.coverage.append((cov, count))
if count <= self.ratio:
self.global_statistics.total_missed_probes += 1
if not cov.rare:
self.global_statistics.total_missed_nonrare_probes += 1
if self.min_coverage_hit is None or self.min_coverage_hit > count:
self.min_coverage_hit = count
self.coverage.sort(key=lambda x: (x[1], x[0].file, x[0].line))
@ -63,9 +66,12 @@ class EnsembleResults:
out.attributes['MinProbeHit'] = str(self.min_coverage_hit)
out.attributes['TotalProbes'] = str(len(self.coverage))
out.attributes['MissedProbes'] = str(self.global_statistics.total_missed_probes)
out.attributes['MissedNonRareProbes'] = str(self.global_statistics.total_missed_nonrare_probes)
for cov, count in self.coverage:
severity = 10 if count > self.ratio else 40
severity = 10
if count <= self.ratio:
severity = 30 if cov.rare else 40
if severity == 40:
errors += 1
if (severity == 40 and errors <= config.max_errors) or config.details:
@ -75,6 +81,7 @@ class EnsembleResults:
child.attributes['Line'] = str(cov.line)
child.attributes['Comment'] = '' if cov.comment is None else cov.comment
child.attributes['HitCount'] = str(count)
child.attributes['Rare'] = str(cov.rare)
out.append(child)
if config.details:


@ -193,16 +193,17 @@ class JsonParser(Parser):
class Coverage:
def __init__(self, file: str, line: str | int, comment: str | None = None):
def __init__(self, file: str, line: str | int, comment: str | None = None, rare: bool = False):
self.file = file
self.line = int(line)
self.comment = comment
self.rare = rare
def to_tuple(self) -> Tuple[str, int, str | None, bool]:
return self.file, self.line, self.comment
return self.file, self.line, self.comment, self.rare
def __eq__(self, other) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() == other
elif isinstance(other, Coverage):
return self.to_tuple() == other.to_tuple()
@ -210,7 +211,7 @@ class Coverage:
return False
def __lt__(self, other) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() < other
elif isinstance(other, Coverage):
return self.to_tuple() < other.to_tuple()
@ -218,7 +219,7 @@ class Coverage:
return False
def __le__(self, other) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() <= other
elif isinstance(other, Coverage):
return self.to_tuple() <= other.to_tuple()
@ -226,7 +227,7 @@ class Coverage:
return False
def __gt__(self, other: Coverage) -> bool:
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() > other
elif isinstance(other, Coverage):
return self.to_tuple() > other.to_tuple()
@ -234,7 +235,7 @@ class Coverage:
return False
def __ge__(self, other):
if isinstance(other, tuple) and len(other) == 3:
if isinstance(other, tuple) and len(other) == 4:
return self.to_tuple() >= other
elif isinstance(other, Coverage):
return self.to_tuple() >= other.to_tuple()
@ -242,7 +243,7 @@ class Coverage:
return False
def __hash__(self):
return hash((self.file, self.line, self.comment))
return hash((self.file, self.line, self.comment, self.rare))
class TraceFiles:
@ -378,6 +379,7 @@ class Summary:
child = SummaryTree('CodeCoverage')
child.attributes['File'] = k.file
child.attributes['Line'] = str(k.line)
child.attributes['Rare'] = str(k.rare)
if not v:
child.attributes['Covered'] = '0'
if k.comment is not None and len(k.comment):
@ -595,7 +597,10 @@ class Summary:
comment = ''
if 'Comment' in attrs:
comment = attrs['Comment']
c = Coverage(attrs['File'], attrs['Line'], comment)
rare = False
if 'Rare' in attrs:
rare = bool(int(attrs['Rare']))
c = Coverage(attrs['File'], attrs['Line'], comment, rare)
if covered or c not in self.coverage:
self.coverage[c] = covered


@ -116,12 +116,12 @@ If an individual zone is unhealthy, it may cause the throttling ratio for storag
### Client Rate Calculation
The smoothed per-client rate for each tag is tracked within `GlobalTagThrottlerImpl::PerTagStatistics`. Once a target rate has been computed, this is passed to `GlobalTagThrottlerImpl::PerTagStatistics::updateAndGetPerClientRate` which adjusts the per-client rate. The per-client rate is meant to limit the busiest clients, so that at equilibrium, the per-client rate will remain constant and the sum of throughput from all clients will match the target rate.
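As a rough sketch of the feedback involved (assumed semantics for illustration, not the actual `GlobalTagThrottlerImpl` code), the per-client limit can be scaled by how far aggregate throughput is from the target:

// If clients collectively overshoot the target, shrink the per-client limit;
// if they undershoot, relax it. At equilibrium the limit stays constant.
double updateAndGetPerClientRate(double targetRate, double totalThroughput, double perClientRate) {
    if (totalThroughput <= 0) {
        return perClientRate; // no traffic yet; keep the current limit
    }
    return perClientRate * (targetRate / totalThroughput);
}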
## Testing
The `GlobalTagThrottling.toml` test provides a simple end-to-end test using the global tag throttler. Quotas are set using the internal tag quota API in the `GlobalTagThrottling` workload. This is run in parallel with the `ReadWrite` workload, which tags transactions. The number of `transaction_tag_throttled` errors is reported, along with the throughput, which should be roughly predictable based on the quota parameters chosen.
## Simulation Testing
The `ThroughputQuota.toml` test provides a simple end-to-end test using the global tag throttler. Quotas are set using the internal tag quota API in the `ThroughputQuota` workload. This is run with the `Cycle` workload, which randomly tags transactions.
In addition to this end-to-end test, there is a suite of unit tests with the `/GlobalTagThrottler/` prefix. These tests run in a mock environment, with mock storage servers providing simulated storage queue statistics and tag busyness reports. Mock clients simulate workload on these mock storage servers, and get throttling feedback directly from a global tag throttler which is monitoring the mock storage servers.
In each test, the `GlobalTagThrottlerTesting::monitor` function is used to periodically check whether or not a desired equilibrium state has been reached. If the desired state is reached and maintained for a sufficient period of time, the test passes. If the unit test is unable to reach this desired equilibrium state before a timeout, the test will fail. Commonly, the desired state is for the global tag throttler to report a client rate sufficiently close to the desired rate specified as an input to the `GlobalTagThrottlerTesting::rateIsNear` function.
In each unit test, the `GlobalTagThrottlerTesting::monitor` function is used to periodically check whether or not a desired equilibrium state has been reached. If the desired state is reached and maintained for a sufficient period of time, the test passes. If the unit test is unable to reach this desired equilibrium state before a timeout, the test will fail. Commonly, the desired state is for the global tag throttler to report a client rate sufficiently close to the desired rate specified as an input to the `GlobalTagThrottlerTesting::rateIsNear` function.
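The equilibrium check itself can be as simple as a tolerance comparison; a hypothetical sketch of a `rateIsNear`-style predicate (the real signature may differ):

#include <cmath>

// True when the reported client rate is within tolerance of the expected rate.
bool rateIsNear(double reported, double expected, double tolerance = 0.1) {
    return std::fabs(reported - expected) <= tolerance * expected;
}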
## Visibility


@ -0,0 +1,47 @@
/*
* BlobRestoreCommand.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/FDBOptions.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // This must be the last #include.
namespace fdb_cli {
ACTOR Future<bool> blobRestoreCommandActor(Database localDb, std::vector<StringRef> tokens) {
if (tokens.size() != 1 && tokens.size() != 2) {
printUsage(tokens[0]);
return false;
}
state bool success = false;
wait(store(success, localDb->blobRestore(normalKeys)));
if (success) {
fmt::print("Started blob restore for the full cluster. Please use 'status' command to check progress.\n");
} else {
fmt::print("Fail to start a new blob restore while there is a pending one.\n");
}
return success;
}
CommandFactory blobRestoreFactory("blobrestore", CommandHelp("blobrestore", "", ""));
} // namespace fdb_cli


@ -19,11 +19,13 @@
*/
#include "fdbcli/fdbcli.actor.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "flow/actorcompiler.h" // This must be the last include
namespace {
enum class LimitType { RESERVED, TOTAL };
enum class QuotaType { RESERVED, TOTAL, STORAGE };
Optional<TransactionTag> parseTag(StringRef token) {
if (token.size() > CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH) {
@ -33,17 +35,19 @@ Optional<TransactionTag> parseTag(StringRef token) {
}
}
Optional<LimitType> parseLimitType(StringRef token) {
Optional<QuotaType> parseQuotaType(StringRef token) {
if (token == "reserved_throughput"_sr) {
return LimitType::RESERVED;
return QuotaType::RESERVED;
} else if (token == "total_throughput"_sr) {
return LimitType::TOTAL;
return QuotaType::TOTAL;
} else if (token == "storage"_sr) {
return QuotaType::STORAGE;
} else {
return {};
}
}
Optional<int64_t> parseLimitValue(StringRef token) {
Optional<int64_t> parseQuotaValue(StringRef token) {
try {
return std::stol(token.toString());
} catch (...) {
@ -51,20 +55,26 @@ Optional<int64_t> parseLimitValue(StringRef token) {
}
}
ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType) {
ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, QuotaType quotaType) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
state ThreadFuture<Optional<Value>> resultFuture =
tr->get(quotaType == QuotaType::STORAGE ? storageQuotaKey(tag) : ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
if (!v.present()) {
fmt::print("<empty>\n");
} else {
if (quotaType == QuotaType::STORAGE) {
int64_t storageQuota = BinaryReader::fromStringRef<int64_t>(v.get(), Unversioned());
fmt::print("{}\n", storageQuota);
return Void();
}
auto const quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
if (limitType == LimitType::TOTAL) {
if (quotaType == QuotaType::TOTAL) {
fmt::print("{}\n", quota.totalQuota);
} else if (limitType == LimitType::RESERVED) {
} else if (quotaType == QuotaType::RESERVED) {
fmt::print("{}\n", quota.reservedQuota);
}
}
@ -75,32 +85,36 @@ ACTOR Future<Void> getQuota(Reference<IDatabase> db, TransactionTag tag, LimitTy
}
}
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, LimitType limitType, int64_t value) {
ACTOR Future<Void> setQuota(Reference<IDatabase> db, TransactionTag tag, QuotaType quotaType, int64_t value) {
state Reference<ITransaction> tr = db->createTransaction();
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
if (quotaType == QuotaType::STORAGE) {
tr->set(storageQuotaKey(tag), BinaryWriter::toValue<int64_t>(value, Unversioned()));
} else {
state ThreadFuture<Optional<Value>> resultFuture = tr->get(ThrottleApi::getTagQuotaKey(tag));
Optional<Value> v = wait(safeThreadFutureToFuture(resultFuture));
ThrottleApi::TagQuotaValue quota;
if (v.present()) {
quota = ThrottleApi::TagQuotaValue::fromValue(v.get());
}
// Internally, costs are stored in terms of pages, but in the API,
// costs are specified in terms of bytes
if (quotaType == QuotaType::TOTAL) {
// Round up to nearest page size
quota.totalQuota = ((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) *
CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
} else if (quotaType == QuotaType::RESERVED) {
// Round up to nearest page size
quota.reservedQuota = ((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) *
CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
}
// Internally, costs are stored in terms of pages, but in the API,
// costs are specified in terms of bytes
if (limitType == LimitType::TOTAL) {
// Round up to nearest page size
quota.totalQuota =
((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) * CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
} else if (limitType == LimitType::RESERVED) {
// Round up to nearest page size
quota.reservedQuota =
((value - 1) / CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE + 1) * CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE;
}
if (!quota.isValid()) {
throw invalid_throttle_quota_value();
}
ThrottleApi::setTagQuota(tr, tag, quota.reservedQuota, quota.totalQuota);
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully updated quota.\n");
return Void();
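The rounding above converts a byte value into whole pages; isolated, the arithmetic is (a standalone sketch with an assumed helper name):

#include <cstdint>

// Round value up to the next multiple of pageSize, e.g. 4097 -> 8192 for a
// 4096-byte page; exact multiples are left unchanged.
int64_t roundUpToPageSize(int64_t value, int64_t pageSize) {
    return ((value - 1) / pageSize + 1) * pageSize;
}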
@ -115,6 +129,7 @@ ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
loop {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
try {
tr->clear(storageQuotaKey(tag));
tr->clear(ThrottleApi::getTagQuotaKey(tag));
wait(safeThreadFutureToFuture(tr->commit()));
fmt::print("Successfully cleared quota.\n");
@ -125,8 +140,8 @@ ACTOR Future<Void> clearQuota(Reference<IDatabase> db, TransactionTag tag) {
}
}
constexpr auto usage = "quota [get <tag> [reserved_throughput|total_throughput] | set <tag> "
"[reserved_throughput|total_throughput] <value> | clear <tag>]";
constexpr auto usage = "quota [get <tag> [reserved_throughput|total_throughput|storage] | set <tag> "
"[reserved_throughput|total_throughput|storage] <value> | clear <tag>]";
bool exitFailure() {
fmt::print(usage);
@ -150,22 +165,22 @@ ACTOR Future<bool> quotaCommandActor(Reference<IDatabase> db, std::vector<String
if (tokens.size() != 4) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
if (!limitType.present()) {
auto const quotaType = parseQuotaType(tokens[3]);
if (!quotaType.present()) {
return exitFailure();
}
wait(getQuota(db, tag.get(), limitType.get()));
wait(getQuota(db, tag.get(), quotaType.get()));
return true;
} else if (tokens[1] == "set"_sr) {
if (tokens.size() != 5) {
return exitFailure();
}
auto const limitType = parseLimitType(tokens[3]);
auto const limitValue = parseLimitValue(tokens[4]);
if (!limitType.present() || !limitValue.present()) {
auto const quotaType = parseQuotaType(tokens[3]);
auto const quotaValue = parseQuotaValue(tokens[4]);
if (!quotaType.present() || !quotaValue.present()) {
return exitFailure();
}
wait(setQuota(db, tag.get(), limitType.get(), limitValue.get()));
wait(setQuota(db, tag.get(), quotaType.get(), quotaValue.get()));
return true;
} else if (tokens[1] == "clear"_sr) {
if (tokens.size() != 3) {


@ -1416,6 +1416,13 @@ ACTOR Future<int> cli(CLIOptions opt, LineNoise* plinenoise, Reference<ClusterCo
continue;
}
if (tokencmp(tokens[0], "blobrestore")) {
bool _result = wait(makeInterruptable(blobRestoreCommandActor(localDb, tokens)));
if (!_result)
is_error = true;
continue;
}
if (tokencmp(tokens[0], "unlock")) {
if ((tokens.size() != 2) || (tokens[1].size() != 32) ||
!std::all_of(tokens[1].begin(), tokens[1].end(), &isxdigit)) {


@ -213,6 +213,9 @@ ACTOR Future<bool> blobRangeCommandActor(Database localDb,
ACTOR Future<bool> blobKeyCommandActor(Database localDb,
Optional<TenantMapEntry> tenantEntry,
std::vector<StringRef> tokens);
// blobrestore command
ACTOR Future<bool> blobRestoreCommandActor(Database localDb, std::vector<StringRef> tokens);
// maintenance command
ACTOR Future<bool> setHealthyZone(Reference<IDatabase> db, StringRef zoneId, double seconds, bool printWarning = false);
ACTOR Future<bool> clearHealthyZone(Reference<IDatabase> db,


@ -137,6 +137,11 @@ def quota(logger):
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota set green storage 98765'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == 'Successfully updated quota.'
command = 'quota get green total_throughput'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
@ -147,6 +152,11 @@ def quota(logger):
logger.debug(command + ' : ' + output)
assert output == '16384'
command = 'quota get green storage'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '98765'
command = 'quota clear green'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
@ -157,6 +167,11 @@ def quota(logger):
logger.debug(command + ' : ' + output)
assert output == '<empty>'
command = 'quota get green storage'
output = run_fdbcli_command(command)
logger.debug(command + ' : ' + output)
assert output == '<empty>'
# Too few arguments, should log help message
command = 'quota get green'
output = run_fdbcli_command(command)


@ -971,6 +971,11 @@ void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion,
// clearVersion as previous guy)
}
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange) {
SortedDeltasT deltasByKey;
sortDeltasByKey(deltasByVersion, fileRange, deltasByKey);
}
// FIXME: Could maybe reduce duplicated code between this and chunkedSnapshot for chunking
Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef,
const Standalone<GranuleDeltas>& deltas,


@ -5924,7 +5924,6 @@ public:
printf("Restoring backup to version: %lld\n", (long long)targetVersion);
}
state int retryCount = 0;
state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
loop {
try {
@ -5948,17 +5947,9 @@ public:
wait(tr->commit());
break;
} catch (Error& e) {
if (e.code() == error_code_transaction_too_old) {
retryCount++;
}
if (e.code() == error_code_restore_duplicate_tag) {
throw;
}
if (g_network->isSimulated() && retryCount > 50) {
CODE_PROBE(true, "submitRestore simulation speedup");
// try to make the read window back to normal size (5 * version_per_sec)
g_simulator->speedUpSimulation = true;
}
wait(tr->onError(e));
}
}


@ -2559,15 +2559,21 @@ bool schemaMatch(json_spirit::mValue const& schemaValue,
}
}
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota) {
void setStorageQuota(Transaction& tr, StringRef tenantGroupName, int64_t quota) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantName);
auto key = storageQuotaKey(tenantGroupName);
tr.set(key, BinaryWriter::toValue<int64_t>(quota, Unversioned()));
}
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName) {
void clearStorageQuota(Transaction& tr, StringRef tenantGroupName) {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
auto key = storageQuotaKey(tenantGroupName);
tr.clear(key);
}
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantGroupName) {
tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantName)));
state Optional<Value> v = wait(tr->get(storageQuotaKey(tenantGroupName)));
if (!v.present()) {
return Optional<int64_t>();
}


@ -965,7 +965,8 @@ ACTOR Future<MonitorLeaderInfo> monitorProxiesOneGeneration(
allConnectionsFailed = false;
} else {
CODE_PROBE(rep.getError().code() == error_code_failed_to_progress,
"Coordinator cant talk to cluster controller");
"Coordinator cant talk to cluster controller",
probe::decoration::rare);
TraceEvent("MonitorProxiesConnectFailed")
.detail("Error", rep.getError().name())
.detail("Coordinator", clientLeaderServer.getAddressString());


@ -4524,9 +4524,11 @@ Future<RangeResultFamily> getRange(Reference<TransactionState> trState,
output.readToBegin = readToBegin;
output.readThroughEnd = readThroughEnd;
if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows)) {
if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows) &&
(!std::is_same<GetKeyValuesFamilyRequest, GetMappedKeyValuesRequest>::value)) {
// Copy instead of resizing because TSS may be using output's arena for comparison. This only
// happens in simulation so it's fine
// disable it on prefetch, because boundary entries serve as continuations
RangeResultFamily copy;
int newSize =
deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size());
@ -4769,7 +4771,8 @@ static Future<Void> tssStreamComparison(Request request,
TSS_traceMismatch(mismatchEvent, request, ssReply.get(), tssReply.get());
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Full TSS Mismatch in stream comparison");
"Tracing Full TSS Mismatch in stream comparison",
probe::decoration::rare);
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Partial TSS Mismatch in stream comparison and storing the rest in FDB");
@ -4811,7 +4814,7 @@ maybeDuplicateTSSStreamFragment(Request& req, QueueModel* model, RequestStream<R
Optional<TSSEndpointData> tssData = model->getTssData(ssStream->getEndpoint().token.first());
if (tssData.present()) {
CODE_PROBE(true, "duplicating stream to TSS");
CODE_PROBE(true, "duplicating stream to TSS", probe::decoration::rare);
resetReply(req);
// FIXME: optimize to avoid creating new netNotifiedQueueWithAcknowledgements for each stream duplication
RequestStream<Request> tssRequestStream(tssData.get().endpoint);
@ -9404,7 +9407,8 @@ void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request,
mismatchEvent.detail("TSSVersion", tssVersion);
CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Full TSS Feed Mismatch in stream comparison");
"Tracing Full TSS Feed Mismatch in stream comparison",
probe::decoration::rare);
CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL,
"Tracing Partial TSS Feed Mismatch in stream comparison and storing the rest in FDB");
@ -10915,6 +10919,37 @@ Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges
return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenantName);
}
ACTOR Future<bool> blobRestoreActor(Reference<DatabaseContext> cx, KeyRange range) {
state Database db(cx);
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);
loop {
try {
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
state Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr->get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (status.progress < 100) {
return false; // stop if there is an in-progress restore.
}
}
Standalone<BlobRestoreStatus> status;
status.progress = 0;
Value newValue = blobRestoreCommandValueFor(status);
tr->set(key, newValue);
wait(tr->commit());
return true;
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
Future<bool> DatabaseContext::blobRestore(KeyRange range) {
return blobRestoreActor(Reference<DatabaseContext>::addRef(this), range);
}
int64_t getMaxKeySize(KeyRef const& key) {
return getMaxWriteKeySize(key, true);
}


@ -1654,7 +1654,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
// This optimization prevents nullptr operations from being added to the conflict range
if (limits.isReached()) {
CODE_PROBE(true, "RYW range read limit 0", probe::decoration::rare);
CODE_PROBE(true, "RYW range read limit 0");
return RangeResult();
}
@ -1668,7 +1668,7 @@ Future<RangeResult> ReadYourWritesTransaction::getRange(KeySelector begin,
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
CODE_PROBE(true, "RYW range inverted", probe::decoration::rare);
CODE_PROBE(true, "RYW range inverted");
return RangeResult();
}
@ -1698,7 +1698,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
if (getDatabase()->apiVersionAtLeast(630)) {
if (specialKeys.contains(begin.getKey()) && specialKeys.begin <= end.getKey() &&
end.getKey() <= specialKeys.end) {
CODE_PROBE(true, "Special key space get range (getMappedRange)");
CODE_PROBE(true, "Special key space get range (getMappedRange)", probe::decoration::rare);
throw client_invalid_operation(); // Not support special keys.
}
} else {
@ -1720,7 +1720,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
// This optimization prevents nullptr operations from being added to the conflict range
if (limits.isReached()) {
CODE_PROBE(true, "RYW range read limit 0 (getMappedRange)");
CODE_PROBE(true, "RYW range read limit 0 (getMappedRange)", probe::decoration::rare);
return MappedRangeResult();
}
@ -1734,7 +1734,7 @@ Future<MappedRangeResult> ReadYourWritesTransaction::getMappedRange(KeySelector
end.removeOrEqual(end.arena());
if (begin.offset >= end.offset && begin.getKey() >= end.getKey()) {
CODE_PROBE(true, "RYW range inverted (getMappedRange)");
CODE_PROBE(true, "RYW range inverted (getMappedRange)", probe::decoration::rare);
return MappedRangeResult();
}


@ -296,7 +296,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000;
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 );
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false ); if(isSimulated) DD_TENANT_AWARENESS_ENABLED = deterministicRandom()->coinflip();
init( DD_TENANT_AWARENESS_ENABLED, false );
init( STORAGE_QUOTA_ENABLED, false ); if(isSimulated) STORAGE_QUOTA_ENABLED = deterministicRandom()->coinflip();
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
@ -809,18 +810,24 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( RANGESTREAM_LIMIT_BYTES, 2e6 ); if( randomize && BUGGIFY ) RANGESTREAM_LIMIT_BYTES = 1;
init( CHANGEFEEDSTREAM_LIMIT_BYTES, 1e6 ); if( randomize && BUGGIFY ) CHANGEFEEDSTREAM_LIMIT_BYTES = 1;
init( BLOBWORKERSTATUSSTREAM_LIMIT_BYTES, 1e4 ); if( randomize && BUGGIFY ) BLOBWORKERSTATUSSTREAM_LIMIT_BYTES = 1;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip() ? false : true;
init( ENABLE_CLEAR_RANGE_EAGER_READS, true ); if( randomize && BUGGIFY ) ENABLE_CLEAR_RANGE_EAGER_READS = deterministicRandom()->coinflip();
init( CHECKPOINT_TRANSFER_BLOCK_BYTES, 40e6 );
init( QUICK_GET_VALUE_FALLBACK, true );
init( QUICK_GET_KEY_VALUES_FALLBACK, true );
init( MAX_PARALLEL_QUICK_GET_VALUE, 50 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100);
init( STRICTLY_ENFORCE_BYTE_LIMIT, false); if( randomize && BUGGIFY ) STRICTLY_ENFORCE_BYTE_LIMIT = deterministicRandom()->coinflip();
init( FRACTION_INDEX_BYTELIMIT_PREFETCH, 0.2); if( randomize && BUGGIFY ) FRACTION_INDEX_BYTELIMIT_PREFETCH = 0.01 + deterministicRandom()->random01();
init( MAX_PARALLEL_QUICK_GET_VALUE, 10 ); if ( randomize && BUGGIFY ) MAX_PARALLEL_QUICK_GET_VALUE = deterministicRandom()->randomInt(1, 100);
init( QUICK_GET_KEY_VALUES_LIMIT, 2000 );
init( QUICK_GET_KEY_VALUES_LIMIT_BYTES, 1e7 );
init( STORAGE_FEED_QUERY_HARD_LIMIT, 100000 );
// Read priority definitions in the form of a list of their relative concurrency share weights
init( STORAGESERVER_READ_PRIORITIES, "120,10,20,40,60" );
// The total concurrency which will be shared by active priorities according to their relative weights
init( STORAGE_SERVER_READ_CONCURRENCY, 70 );
// Priorities which each ReadType maps to, in enumeration order
init( STORAGESERVER_READ_RANKS, "0,2,1,1,1" );
init( STORAGESERVER_READ_PRIORITIES, "48,32,8" );
// The priority number which each ReadType maps to in enumeration order
// This exists for flexibility but assigning each ReadType to its own unique priority number makes the most sense
// The enumeration is currently: eager, fetch, low, normal, high
init( STORAGESERVER_READTYPE_PRIORITY_MAP, "0,1,2,3,4" );
//Wait Failure
init( MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS, 250 ); if( randomize && BUGGIFY ) MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS = 2;
@ -944,7 +951,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REDWOOD_HISTOGRAM_INTERVAL, 30.0 );
init( REDWOOD_EVICT_UPDATED_PAGES, true ); if( randomize && BUGGIFY ) { REDWOOD_EVICT_UPDATED_PAGES = false; }
init( REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT, 2 ); if( randomize && BUGGIFY ) { REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT = deterministicRandom()->randomInt(1, 7); }
init( REDWOOD_PRIORITY_LAUNCHS, "32,32,32,32" );
init( REDWOOD_IO_PRIORITIES, "32,32,32,32" );
init( REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT, false );
// Server request latency measurement
@ -1018,6 +1025,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANIFEST_BACKUP_INTERVAL, isSimulated ? 5.0 : 30.0 );
init( BLOB_FULL_RESTORE_MODE, false );
init( BLOB_MIGRATOR_CHECK_INTERVAL, isSimulated ? 1.0 : 5.0);
init( BLOB_MANIFEST_RW_ROWS, isSimulated ? 10 : 1000);
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );


@ -1660,11 +1660,41 @@ BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value) {
return interf;
}
const KeyRangeRef blobRestoreCommandKeys("\xff\x02/blobRestoreCommand/"_sr, "\xff\x02/blobRestoreCommand0"_sr);
const Value blobRestoreCommandKeyFor(const KeyRangeRef range) {
BinaryWriter wr(AssumeVersion(ProtocolVersion::withBlobGranule()));
wr.serializeBytes(blobRestoreCommandKeys.begin);
wr << range;
return wr.toValue();
}
const KeyRange decodeBlobRestoreCommandKeyFor(const KeyRef key) {
KeyRange range;
BinaryReader reader(key.removePrefix(blobRestoreCommandKeys.begin),
AssumeVersion(ProtocolVersion::withBlobGranule()));
reader >> range;
return range;
}
const Value blobRestoreCommandValueFor(BlobRestoreStatus status) {
BinaryWriter wr(IncludeVersion(ProtocolVersion::withBlobGranule()));
wr << status;
return wr.toValue();
}
Standalone<BlobRestoreStatus> decodeBlobRestoreStatus(ValueRef const& value) {
Standalone<BlobRestoreStatus> status;
BinaryReader reader(value, IncludeVersion());
reader >> status;
return status;
}
const KeyRangeRef storageQuotaKeys("\xff/storageQuota/"_sr, "\xff/storageQuota0"_sr);
const KeyRef storageQuotaPrefix = storageQuotaKeys.begin;
Key storageQuotaKey(StringRef tenantName) {
return tenantName.withPrefix(storageQuotaPrefix);
Key storageQuotaKey(StringRef tenantGroupName) {
return tenantGroupName.withPrefix(storageQuotaPrefix);
}
const KeyRangeRef idempotencyIdKeys("\xff\x02/idmp/"_sr, "\xff\x02/idmp0"_sr);


@ -18,6 +18,7 @@
* limitations under the License.
*/
#include "fdbrpc/Msgpack.h"
#include "fdbclient/Tracing.h"
#include "flow/IRandom.h"
#include "flow/UnitTest.h"
@ -79,41 +80,6 @@ struct LogfileTracer : ITracer {
}
};
struct TraceRequest {
std::unique_ptr<uint8_t[]> buffer;
// Amount of data in buffer (bytes).
std::size_t data_size;
// Size of buffer (bytes).
std::size_t buffer_size;
void write_byte(uint8_t byte) { write_bytes(&byte, 1); }
void write_bytes(const uint8_t* buf, std::size_t n) {
resize(n);
std::copy(buf, buf + n, buffer.get() + data_size);
data_size += n;
}
void resize(std::size_t n) {
if (data_size + n <= buffer_size) {
return;
}
std::size_t size = buffer_size;
while (size < data_size + n) {
size *= 2;
}
TraceEvent(SevInfo, "TracingSpanResizedBuffer").detail("OldSize", buffer_size).detail("NewSize", size);
auto new_buffer = std::make_unique<uint8_t[]>(size);
std::copy(buffer.get(), buffer.get() + data_size, new_buffer.get());
buffer = std::move(new_buffer);
buffer_size = size;
}
void reset() { data_size = 0; }
};
// A server listening for UDP trace messages, run only in simulation.
ACTOR Future<Void> simulationStartServer() {
// We're going to force the address to be loopback regardless of FLOW_KNOBS->TRACING_UDP_LISTENER_ADDR
@ -167,146 +133,89 @@ ACTOR Future<Void> traceLog(int* pendingMessages, bool* sendError) {
struct UDPTracer : public ITracer {
// Serializes span fields as an array into the supplied TraceRequest
// buffer.
void serialize_span(const Span& span, TraceRequest& request) {
void serialize_span(const Span& span, MsgpackBuffer& buf) {
uint16_t size = 12;
request.write_byte(size | 0b10010000); // write as array
serialize_value(span.context.traceID.first(), request, 0xcf); // trace id
serialize_value(span.context.traceID.second(), request, 0xcf); // trace id
serialize_value(span.context.spanID, request, 0xcf); // spanid
buf.write_byte(size | 0b10010000); // write as array
serialize_value(span.context.traceID.first(), buf, 0xcf); // trace id
serialize_value(span.context.traceID.second(), buf, 0xcf); // trace id
serialize_value(span.context.spanID, buf, 0xcf); // spanid
// parent span id
serialize_value(span.parentContext.spanID, request, 0xcf); // spanId
serialize_value(span.parentContext.spanID, buf, 0xcf); // spanId
// Payload
serialize_string(span.location.name.toString(), request);
serialize_value(span.begin, request, 0xcb); // start time
serialize_value(span.end, request, 0xcb); // end
serialize_string(span.location.name.toString(), buf);
serialize_value(span.begin, buf, 0xcb); // start time
serialize_value(span.end, buf, 0xcb); // end
// Kind
serialize_value(span.kind, request, 0xcc);
serialize_value(span.kind, buf, 0xcc);
// Status
serialize_value(span.status, request, 0xcc);
serialize_value(span.status, buf, 0xcc);
// Links
serialize_vector(span.links, request);
serialize_vector(span.links, buf);
// Events
serialize_vector(span.events, request);
serialize_vector(span.events, buf);
// Attributes
serialize_map(span.attributes, request);
serialize_map(span.attributes, buf);
}
private:
// Writes the given value in big-endian format to the request. Sets the
// first byte to msgpack_type.
template <typename T>
inline void serialize_value(const T& val, TraceRequest& request, uint8_t msgpack_type) {
request.write_byte(msgpack_type);
const uint8_t* p = reinterpret_cast<const uint8_t*>(std::addressof(val));
for (size_t i = 0; i < sizeof(T); ++i) {
request.write_byte(p[sizeof(T) - i - 1]);
}
}
// Writes the given string to the request as a sequence of bytes. Inserts a
// format byte at the beginning of the string according to its length,
// as specified by the msgpack specification.
inline void serialize_string(const uint8_t* c, int length, TraceRequest& request) {
if (length <= 31) {
// A size 0 string is ok. We still need to write a byte
// identifying the item as a string, but can set the size to 0.
request.write_byte(static_cast<uint8_t>(length) | 0b10100000);
} else if (length <= 255) {
request.write_byte(0xd9);
request.write_byte(static_cast<uint8_t>(length));
} else if (length <= 65535) {
request.write_byte(0xda);
request.write_byte(reinterpret_cast<const uint8_t*>(&length)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&length)[0]);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeString")
.detail("Failed to MessagePack encode very large string", length);
ASSERT_WE_THINK(false);
}
request.write_bytes(c, length);
}
inline void serialize_string(const std::string& str, TraceRequest& request) {
serialize_string(reinterpret_cast<const uint8_t*>(str.data()), str.size(), request);
}
// Writes the given vector of linked SpanContext's to the request. If the vector is
// empty, the request is not modified.
inline void serialize_vector(const SmallVectorRef<SpanContext>& vec, TraceRequest& request) {
inline void serialize_vector(const SmallVectorRef<SpanContext>& vec, MsgpackBuffer& buf) {
int size = vec.size();
if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10010000);
buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) {
request.write_byte(0xdc);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
buf.write_byte(0xdc);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
for (const auto& link : vec) {
serialize_value(link.traceID.first(), request, 0xcf); // trace id
serialize_value(link.traceID.second(), request, 0xcf); // trace id
serialize_value(link.spanID, request, 0xcf); // spanid
serialize_value(link.traceID.first(), buf, 0xcf); // trace id
serialize_value(link.traceID.second(), buf, 0xcf); // trace id
serialize_value(link.spanID, buf, 0xcf); // spanid
}
}
// Writes the given vector of linked SpanContexts to the request. If the vector is
// Writes the given vector of linked SpanEventRefs to the request. If the vector is
// empty, the request is not modified.
inline void serialize_vector(const SmallVectorRef<SpanEventRef>& vec, TraceRequest& request) {
inline void serialize_vector(const SmallVectorRef<SpanEventRef>& vec, MsgpackBuffer& buf) {
int size = vec.size();
if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10010000);
buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) {
request.write_byte(0xdc);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
request.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
buf.write_byte(0xdc);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
for (const auto& event : vec) {
serialize_string(event.name.toString(), request); // event name
serialize_value(event.time, request, 0xcb); // event time
serialize_vector(event.attributes, request);
serialize_string(event.name.toString(), buf); // event name
serialize_value(event.time, buf, 0xcb); // event time
serialize_vector(event.attributes, buf);
}
}
inline void serialize_vector(const SmallVectorRef<KeyValueRef>& vals, TraceRequest& request) {
inline void serialize_vector(const SmallVectorRef<KeyValueRef>& vals, MsgpackBuffer& buf) {
int size = vals.size();
if (size <= 15) {
// N.B. We're actually writing this out as a fixmap here in messagepack format!
// fixmap 1000xxxx 0x80 - 0x8f
request.write_byte(static_cast<uint8_t>(size) | 0b10000000);
buf.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
for (const auto& kv : vals) {
serialize_string(kv.key.toString(), request);
serialize_string(kv.value.toString(), request);
}
}
template <class Map>
inline void serialize_map(const Map& map, TraceRequest& request) {
int size = map.size();
if (size <= 15) {
request.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "TracingSpanSerializeMap").detail("Failed to MessagePack encode large map", size);
ASSERT_WE_THINK(false);
}
for (const auto& [key, value] : map) {
serialize_string(key.begin(), key.size(), request);
serialize_string(value.begin(), value.size(), request);
serialize_string(kv.key.toString(), buf);
serialize_string(kv.value.toString(), buf);
}
}
};
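For intuition, the headers this serializer writes follow the MessagePack spec directly: a fixarray of n <= 15 elements is the single byte (n | 0b10010000), a fixmap is (n | 0b10000000), and the tag bytes 0xcf, 0xcb, and 0xcc prefix a uint64, float64, and uint8 respectively. A worked example with a hypothetical field count:

// A span encoded as a 14-element fixarray starts with 14 | 0b10010000 == 0x9e;
// a uint64 trace-id half equal to 7 then contributes 0xcf 00 00 00 00 00 00 00 07.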
@ -336,9 +245,9 @@ ACTOR Future<Void> fastTraceLogger(int* unreadyMessages, int* failedMessages, in
struct FastUDPTracer : public UDPTracer {
FastUDPTracer()
: unready_socket_messages_(0), failed_messages_(0), total_messages_(0), socket_fd_(-1), send_error_(false) {
request_ = TraceRequest{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
request_ = MsgpackBuffer{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
}
TracerType type() const override { return TracerType::NETWORK_LOSSY; }
@ -394,7 +303,7 @@ struct FastUDPTracer : public UDPTracer {
}
private:
TraceRequest request_;
MsgpackBuffer request_;
int unready_socket_messages_;
int failed_messages_;
@ -657,9 +566,9 @@ TEST_CASE("/flow/Tracing/FastUDPMessagePackEncoding") {
IKnobCollection::getMutableGlobalKnobCollection().setKnob("tracing_span_attributes_enabled",
KnobValueRef::create(bool{ true }));
Span span1("encoded_span"_loc);
auto request = TraceRequest{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
auto request = MsgpackBuffer{ .buffer = std::make_unique<uint8_t[]>(kTraceBufferSize),
.data_size = 0,
.buffer_size = kTraceBufferSize };
auto tracer = FastUDPTracer();
tracer.serialize_span(span1, request);
auto data = request.buffer.get();

View File

@ -313,4 +313,15 @@ struct BlobManifest {
}
};
// Defines blob restore status
struct BlobRestoreStatus {
constexpr static FileIdentifier file_identifier = 378657;
int progress;
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, progress);
}
};
#endif
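Since BlobRestoreStatus carries a file_identifier, it round-trips through the project's object serializer. A minimal sketch of encode/decode helpers, assuming the usual ObjectWriter/ObjectReader API (these function names are illustrative, not part of this commit):

// Hedged sketch: encode/decode a BlobRestoreStatus (assumed serializer API).
Value encodeRestoreStatus(const BlobRestoreStatus& status) {
    return ObjectWriter::toValue(status, IncludeVersion());
}
Standalone<BlobRestoreStatus> decodeRestoreStatus(ValueRef value) {
    return ObjectReader::fromStringRef<Standalone<BlobRestoreStatus>>(value, IncludeVersion());
}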

View File

@ -56,4 +56,7 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk,
std::string randomBGFilename(UID blobWorkerID, UID granuleID, Version version, std::string suffix);
#endif
// For benchmark testing only. It should never be called in prod.
void sortDeltasByKey(const Standalone<GranuleDeltas>& deltasByVersion, const KeyRangeRef& fileRange);
#endif

View File

@ -403,6 +403,7 @@ public:
Future<Version> verifyBlobRange(const KeyRange& range,
Optional<Version> version,
Optional<TenantName> tenantName = {});
Future<bool> blobRestore(const KeyRange range);
// private:
explicit DatabaseContext(Reference<AsyncVar<Reference<IClusterConnectionRecord>>> connectionRecord,

View File

@ -163,9 +163,10 @@ bool schemaMatch(json_spirit::mValue const& schema,
// storage nodes
ACTOR Future<Void> mgmtSnapCreate(Database cx, Standalone<StringRef> snapCmd, UID snapUID);
// Set and get the storage quota per tenant
void setStorageQuota(Transaction& tr, StringRef tenantName, int64_t quota);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantName);
// Set/clear/get the storage quota for the given tenant group
void setStorageQuota(Transaction& tr, StringRef tenantGroupName, int64_t quota);
void clearStorageQuota(Transaction& tr, StringRef tenantGroupName);
ACTOR Future<Optional<int64_t>> getStorageQuota(Transaction* tr, StringRef tenantGroupName);
#include "flow/unactorcompiler.h"
#endif

View File

@ -237,6 +237,8 @@ public:
int64_t
DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC; // Minimal age of a correct-configured server before it's chosen to be wiggled
bool DD_TENANT_AWARENESS_ENABLED;
bool STORAGE_QUOTA_ENABLED; // Whether storage quota enforcement for tenant groups and all the relevant storage
// usage / quota monitors are enabled.
int TENANT_CACHE_LIST_REFRESH_INTERVAL; // How often the TenantCache is refreshed
int TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL; // How often the storage bytes used by each tenant is refreshed
// in the TenantCache
@ -761,14 +763,16 @@ public:
bool ENABLE_CLEAR_RANGE_EAGER_READS;
bool QUICK_GET_VALUE_FALLBACK;
bool QUICK_GET_KEY_VALUES_FALLBACK;
bool STRICTLY_ENFORCE_BYTE_LIMIT;
double FRACTION_INDEX_BYTELIMIT_PREFETCH;
int MAX_PARALLEL_QUICK_GET_VALUE;
int CHECKPOINT_TRANSFER_BLOCK_BYTES;
int QUICK_GET_KEY_VALUES_LIMIT;
int QUICK_GET_KEY_VALUES_LIMIT_BYTES;
int STORAGE_FEED_QUERY_HARD_LIMIT;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READ_RANKS;
std::string STORAGESERVER_READ_PRIORITIES;
int STORAGE_SERVER_READ_CONCURRENCY;
std::string STORAGESERVER_READTYPE_PRIORITY_MAP;
// Wait Failure
int MAX_OUTSTANDING_WAIT_FAILURE_REQUESTS;
@ -917,7 +921,7 @@ public:
int REDWOOD_DECODECACHE_REUSE_MIN_HEIGHT; // Minimum height for which to keep and reuse page decode caches
bool REDWOOD_SPLIT_ENCRYPTED_PAGES_BY_TENANT; // Whether to split pages by tenant if encryption is enabled
std::string REDWOOD_PRIORITY_LAUNCHS;
std::string REDWOOD_IO_PRIORITIES;
// Server request latency measurement
int LATENCY_SAMPLE_SIZE;
@ -992,6 +996,7 @@ public:
double BLOB_MANIFEST_BACKUP_INTERVAL;
bool BLOB_FULL_RESTORE_MODE;
double BLOB_MIGRATOR_CHECK_INTERVAL;
int BLOB_MANIFEST_RW_ROWS;
// Blob metadata
int64_t BLOB_METADATA_CACHE_TTL;

View File

@ -710,11 +710,18 @@ UID decodeBlobWorkerListKey(KeyRef const& key);
const Value blobWorkerListValue(BlobWorkerInterface const& interface);
BlobWorkerInterface decodeBlobWorkerListValue(ValueRef const& value);
// Blob restore command
extern const KeyRangeRef blobRestoreCommandKeys;
const Value blobRestoreCommandKeyFor(const KeyRangeRef range);
const KeyRange decodeBlobRestoreCommandKeyFor(const KeyRef key);
const Value blobRestoreCommandValueFor(BlobRestoreStatus status);
Standalone<BlobRestoreStatus> decodeBlobRestoreStatus(ValueRef const& value);
// Storage quota per tenant
// "\xff/storageQuota/[[tenantName]]" := "[[quota]]"
// "\xff/storageQuota/[[tenantGroupName]]" := "[[quota]]"
extern const KeyRangeRef storageQuotaKeys;
extern const KeyRef storageQuotaPrefix;
Key storageQuotaKey(StringRef tenantName);
Key storageQuotaKey(StringRef tenantGroupName);
extern const KeyRangeRef idempotencyIdKeys;
extern const KeyRef idempotencyIdsExpiredVersion;

View File

@ -69,7 +69,7 @@ TEST_CASE("/flow/buggifiedDelay") {
});
wait(f1 && f2);
if (last == 1) {
CODE_PROBE(true, "Delays can become ready out of order");
CODE_PROBE(true, "Delays can become ready out of order", probe::decoration::rare);
return Void();
}
}

View File

@ -216,7 +216,7 @@ bool TokenCacheImpl::validateAndAdd(double currentTime, StringRef token, Network
Arena arena;
authz::jwt::TokenRef t;
if (!authz::jwt::parseToken(arena, t, token)) {
CODE_PROBE(true, "Token can't be parsed");
CODE_PROBE(true, "Token can't be parsed", probe::decoration::rare);
TraceEvent(SevWarn, "InvalidToken")
.detail("From", peer)
.detail("Reason", "ParseError")
@ -225,35 +225,35 @@ bool TokenCacheImpl::validateAndAdd(double currentTime, StringRef token, Network
}
auto key = FlowTransport::transport().getPublicKeyByName(t.keyId);
if (!key.present()) {
CODE_PROBE(true, "Token referencing non-existing key");
CODE_PROBE(true, "Token referencing non-existing key", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("UnknownKey", t);
return false;
} else if (!t.issuedAtUnixTime.present()) {
CODE_PROBE(true, "Token has no issued-at field");
CODE_PROBE(true, "Token has no issued-at field", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoIssuedAt", t);
return false;
} else if (!t.expiresAtUnixTime.present()) {
CODE_PROBE(true, "Token has no expiration time");
CODE_PROBE(true, "Token has no expiration time", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoExpirationTime", t);
return false;
} else if (double(t.expiresAtUnixTime.get()) <= currentTime) {
CODE_PROBE(true, "Expired token");
CODE_PROBE(true, "Expired token", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("Expired", t);
return false;
} else if (!t.notBeforeUnixTime.present()) {
CODE_PROBE(true, "Token has no not-before field");
CODE_PROBE(true, "Token has no not-before field", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoNotBefore", t);
return false;
} else if (double(t.notBeforeUnixTime.get()) > currentTime) {
CODE_PROBE(true, "Tokens not-before is in the future");
CODE_PROBE(true, "Tokens not-before is in the future", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("TokenNotYetValid", t);
return false;
} else if (!t.tenants.present()) {
CODE_PROBE(true, "Token with no tenants");
CODE_PROBE(true, "Token with no tenants", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("NoTenants", t);
return false;
} else if (!authz::jwt::verifyToken(token, key.get())) {
CODE_PROBE(true, "Token with invalid signature");
CODE_PROBE(true, "Token with invalid signature", probe::decoration::rare);
TRACE_INVALID_PARSED_TOKEN("InvalidSignature", t);
return false;
} else {
@ -300,7 +300,7 @@ bool TokenCacheImpl::validate(TenantNameRef name, StringRef token) {
}
}
if (!tenantFound) {
CODE_PROBE(true, "Valid token doesn't reference tenant");
CODE_PROBE(true, "Valid token doesn't reference tenant", probe::decoration::rare);
TraceEvent(SevWarn, "TenantTokenMismatch").detail("From", peer).detail("Tenant", name.toString());
return false;
}

View File

@ -0,0 +1,157 @@
/*
* Msgpack.h
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FDBRPC_MSGPACK_H
#define FDBRPC_MSGPACK_H
#include <limits>
#pragma once
#include <memory>
#include <algorithm>
#include "flow/Trace.h"
#include "flow/Error.h"
#include "flow/network.h"
struct MsgpackBuffer {
std::unique_ptr<uint8_t[]> buffer;
// Amount of data in buffer (bytes).
std::size_t data_size;
// Size of buffer (bytes).
std::size_t buffer_size;
void write_byte(uint8_t byte) { write_bytes(&byte, 1); }
// Overwrites a previously written byte; assumes pos < data_size
void edit_byte(uint8_t byte, size_t pos) { buffer[pos] = byte; }
void write_bytes(const uint8_t* buf, std::size_t n) {
resize(n);
std::copy(buf, buf + n, buffer.get() + data_size);
data_size += n;
}
void resize(std::size_t n) {
if (data_size + n <= buffer_size) {
return;
}
std::size_t size = buffer_size;
while (size < data_size + n) {
size *= 2;
}
TraceEvent(SevInfo, "MsgpackResizedBuffer").detail("OldSize", buffer_size).detail("NewSize", size);
auto new_buffer = std::make_unique<uint8_t[]>(size);
std::copy(buffer.get(), buffer.get() + data_size, new_buffer.get());
buffer = std::move(new_buffer);
buffer_size = size;
}
void reset() { data_size = 0; }
};
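// Growth example: with buffer_size == 64, appending 100 bytes via write_bytes()
// doubles the capacity until the data fits (64 -> 128), copies the existing
// data_size bytes into the new allocation, and then appends the new bytes.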
inline void serialize_bool(bool val, MsgpackBuffer& buf) {
if (val) {
buf.write_byte(0xc3);
} else {
buf.write_byte(0xc2);
}
}
// Writes the given value in big-endian format to the request. Sets the
// first byte to msgpack_type.
template <typename T>
inline void serialize_value(const T& val, MsgpackBuffer& buf, uint8_t msgpack_type) {
buf.write_byte(msgpack_type);
const uint8_t* p = reinterpret_cast<const uint8_t*>(std::addressof(val));
for (size_t i = 0; i < sizeof(T); ++i) {
buf.write_byte(p[sizeof(T) - i - 1]);
}
}
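// Example: serialize_value<uint64_t>(7, buf, 0xcf) writes the tag byte 0xcf
// followed by 00 00 00 00 00 00 00 07 -- the value in big-endian byte order.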
// Writes the given string to the request as a sequence of bytes. Inserts a
// format byte at the beginning of the string according to its length,
// as specified by the msgpack specification.
inline void serialize_string(const uint8_t* c, int length, MsgpackBuffer& buf) {
if (length <= 31) {
// A size 0 string is ok. We still need to write a byte
// identifying the item as a string, but can set the size to 0.
buf.write_byte(static_cast<uint8_t>(length) | 0b10100000);
} else if (length <= 255) {
buf.write_byte(0xd9);
buf.write_byte(static_cast<uint8_t>(length));
} else if (length <= 65535) {
buf.write_byte(0xda);
buf.write_byte(reinterpret_cast<const uint8_t*>(&length)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&length)[0]);
} else {
TraceEvent(SevWarn, "MsgpackSerializeString").detail("Failed to MessagePack encode very large string", length);
ASSERT_WE_THINK(false);
}
buf.write_bytes(c, length);
}
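// Example: "hello" (5 bytes) becomes the fixstr header 0xa5 (0b10100000 | 5)
// followed by the raw bytes; a 300-byte string would instead get the str16
// header 0xda plus the two-byte big-endian length 0x01 0x2c.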
inline void serialize_string(const std::string& str, MsgpackBuffer& buf) {
serialize_string(reinterpret_cast<const uint8_t*>(str.data()), str.size(), buf);
}
template <typename T, typename F>
inline void serialize_vector(const std::vector<T>& vec, MsgpackBuffer& buf, F f) {
size_t size = vec.size();
if (size <= 15) {
buf.write_byte(static_cast<uint8_t>(size) | 0b10010000);
} else if (size <= 65535) {
buf.write_byte(0xdc);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else if (size <= std::numeric_limits<uint32_t>::max()) {
buf.write_byte(0xdd);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[3]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[2]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[1]);
buf.write_byte(reinterpret_cast<const uint8_t*>(&size)[0]);
} else {
TraceEvent(SevWarn, "MsgPackSerializeVector").detail("Failed to MessagePack encode large vector", size);
ASSERT_WE_THINK(false);
}
// Use the provided serializer function to serialize the individual elements of the vector
for (const auto& val : vec) {
f(val, buf);
}
}
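// Example: serialize_vector(std::vector<int32_t>{ 1, 2, 3 }, buf,
//     [](const int32_t& v, MsgpackBuffer& b) { serialize_value(v, b, 0xd2); });
// writes the fixarray header 0x93, then each element as 0xd2 plus four
// big-endian bytes.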
template <class Map>
inline void serialize_map(const Map& map, MsgpackBuffer& buf) {
int size = map.size();
if (size <= 15) {
buf.write_byte(static_cast<uint8_t>(size) | 0b10000000);
} else {
TraceEvent(SevWarn, "MsgPackSerializeMap").detail("Failed to MessagePack encode large map", size);
ASSERT_WE_THINK(false);
}
for (const auto& [key, value] : map) {
serialize_string(key.begin(), key.size(), buf);
serialize_string(value.begin(), value.size(), buf);
}
}
#endif
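Taken together, a minimal usage sketch for the new header (a hypothetical caller; sizes and values here are illustrative):

#include <string>
#include <vector>
#include "fdbrpc/Msgpack.h"

void msgpackExample() {
    // Start small; write_bytes() grows the buffer by doubling as needed.
    MsgpackBuffer buf{ .buffer = std::make_unique<uint8_t[]>(64), .data_size = 0, .buffer_size = 64 };
    serialize_bool(true, buf); // 0xc3
    serialize_value<uint64_t>(42, buf, 0xcf); // 0xcf + 8 big-endian bytes
    serialize_string(std::string("hello"), buf); // 0xa5 'h' 'e' 'l' 'l' 'o'
    std::vector<uint64_t> ids{ 1, 2, 3 };
    serialize_vector(ids, buf, [](const uint64_t& v, MsgpackBuffer& b) { serialize_value(v, b, 0xcf); });
    // buf.buffer now holds buf.data_size bytes of MessagePack data;
    // buf.reset() rewinds for reuse without freeing the allocation.
}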

View File

@ -20,6 +20,7 @@
#ifndef FDBRPC_TIMED_REQUEST_H
#define FDBRPC_TIMED_REQUEST_H
#include "flow/network.h"
#pragma once
#include <fdbrpc/fdbrpc.h>
@ -35,7 +36,7 @@ public:
TimedRequest() {
if (!FlowTransport::isClient()) {
_requestTime = timer();
_requestTime = g_network->timer();
} else {
_requestTime = 0.0;
}

View File

@ -2361,7 +2361,7 @@ class UDPSimSocket : public IUDPSocket, ReferenceCounted<UDPSimSocket> {
NetworkAddress _localAddress;
bool randomDropPacket() {
auto res = deterministicRandom()->random01() < .000001;
CODE_PROBE(res, "UDP packet drop", probe::context::sim2, probe::assert::simOnly);
CODE_PROBE(res, "UDP packet drop", probe::context::sim2, probe::assert::simOnly, probe::decoration::rare);
return res;
}

View File

@ -654,7 +654,7 @@ private:
TraceEvent("WriteRecoveryKeySet", dbgid).log();
if (!initialCommit)
txnStateStore->set(KeyValueRef(m.param1, m.param2));
CODE_PROBE(true, "Snapshot created, setting writeRecoveryKey in txnStateStore");
CODE_PROBE(true, "Snapshot created, setting writeRecoveryKey in txnStateStore", probe::decoration::rare);
}
void checkSetTenantMapPrefix(MutationRef m) {

View File

@ -388,6 +388,8 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
Promise<Void> iAmReplaced;
bool isFullRestoreMode = false;
BlobManagerData(UID id,
Reference<AsyncVar<ServerDBInfo> const> dbInfo,
Database db,
@ -439,7 +441,7 @@ struct BlobManagerData : NonCopyable, ReferenceCounted<BlobManagerData> {
// if this granule is not an active granule, it can't be merged
auto gIt = workerAssignments.rangeContaining(range.begin);
if (gIt->begin() != range.begin || gIt->end() != range.end) {
CODE_PROBE(true, "non-active granule reported merge eligible, ignoring");
CODE_PROBE(true, "non-active granule reported merge eligible, ignoring", probe::decoration::rare);
if (BM_DEBUG) {
fmt::print(
"BM {0} Ignoring Merge Candidate [{1} - {2}): range mismatch with active granule [{3} - {4})\n",
@ -1034,7 +1036,7 @@ static bool handleRangeIsAssign(Reference<BlobManagerData> bmData, RangeAssignme
if (assignment.assign.get().type == AssignRequestType::Continue) {
ASSERT(assignment.worker.present());
if (i.range() != assignment.keyRange || i.cvalue() != assignment.worker.get()) {
CODE_PROBE(true, "BM assignment out of date");
CODE_PROBE(true, "BM assignment out of date", probe::decoration::rare);
if (BM_DEBUG) {
fmt::print("Out of date re-assign for ({0}, {1}). Assignment must have changed while "
"checking split.\n Reassign: [{2} - {3}): {4}\n Existing: [{5} - {6}): {7}\n",
@ -1601,10 +1603,10 @@ ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobManagerData> bmData,
if (retried && prevOwnerEpoch == bmData->epoch && prevGranuleID == granuleID &&
prevOwnerSeqno == std::numeric_limits<int64_t>::max()) {
// owner didn't change, last iteration of this transaction just succeeded but threw an error.
CODE_PROBE(true, "split too big adjustment succeeded after retry");
CODE_PROBE(true, "split too big adjustment succeeded after retry", probe::decoration::rare);
break;
}
CODE_PROBE(true, "split too big was since moved to another worker");
CODE_PROBE(true, "split too big was since moved to another worker", probe::decoration::rare);
if (BM_DEBUG) {
fmt::print("BM {0} re-evaluating initial split [{1} - {2}) too big: moved to another worker\n",
bmData->epoch,
@ -1838,7 +1840,7 @@ ACTOR Future<Void> maybeSplitRange(Reference<BlobManagerData> bmData,
wait(checkManagerLock(tr, bmData));
ForcedPurgeState purgeState = wait(getForcePurgedState(&tr->getTransaction(), granuleRange));
if (purgeState != ForcedPurgeState::NonePurged) {
CODE_PROBE(true, "Split stopped because of force purge");
CODE_PROBE(true, "Split stopped because of force purge", probe::decoration::rare);
TraceEvent("GranuleSplitCancelledForcePurge", bmData->id)
.detail("Epoch", bmData->epoch)
.detail("GranuleRange", granuleRange);
@ -2634,7 +2636,9 @@ ACTOR Future<Void> attemptMerges(Reference<BlobManagerData> bmData,
currentBytes + metrics.bytes > SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES ||
currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2) {
ASSERT(currentBytes <= SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES);
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2, "merge early because of key size");
CODE_PROBE(currentKeySumBytes >= CLIENT_KNOBS->VALUE_SIZE_LIMIT / 2,
"merge early because of key size",
probe::decoration::rare);
attemptStartMerge(bmData, currentCandidates);
currentCandidates.clear();
currentBytes = 0;
@ -3253,7 +3257,7 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
if (oldEpoch > newEpoch || (oldEpoch == newEpoch && oldSeqno > newSeqno)) {
newer.push_back(std::pair(old.range(), std::tuple(oldWorker, oldEpoch, oldSeqno)));
if (old.range() != newRange) {
CODE_PROBE(true, "BM Recovery: BWs disagree on range boundaries");
CODE_PROBE(true, "BM Recovery: BWs disagree on range boundaries", probe::decoration::rare);
anyConflicts = true;
}
} else {
@ -3287,7 +3291,8 @@ static void addAssignment(KeyRangeMap<std::tuple<UID, int64_t, int64_t>>& map,
std::get<0>(old.value()) = UID();
}
if (outOfDate.empty() || outOfDate.back() != std::pair(oldWorker, KeyRange(old.range()))) {
CODE_PROBE(true, "BM Recovery: Two workers claim ownership of same granule");
CODE_PROBE(
true, "BM Recovery: Two workers claim ownership of same granule", probe::decoration::rare);
outOfDate.push_back(std::pair(oldWorker, old.range()));
}
}
@ -3538,7 +3543,10 @@ ACTOR Future<Void> recoverBlobManager(Reference<BlobManagerData> bmData) {
bmData->startRecruiting.trigger();
bmData->initBStore();
if (isFullRestoreMode()) {
bool isFullRestore = wait(isFullRestoreMode(bmData->db, normalKeys));
bmData->isFullRestoreMode = isFullRestore;
if (bmData->isFullRestoreMode) {
wait(loadManifest(bmData->db, bmData->bstore));
int64_t epoc = wait(lastBlobEpoc(bmData->db, bmData->bstore));
@ -5298,11 +5306,8 @@ ACTOR Future<Void> backupManifest(Reference<BlobManagerData> bmData) {
bmData->initBStore();
loop {
bool pendingSplit = wait(hasPendingSplit(bmData));
if (!pendingSplit) {
wait(dumpManifest(bmData->db, bmData->bstore, bmData->epoch, bmData->manifestDumperSeqNo));
bmData->manifestDumperSeqNo++;
}
wait(dumpManifest(bmData->db, bmData->bstore, bmData->epoch, bmData->manifestDumperSeqNo));
bmData->manifestDumperSeqNo++;
wait(delay(SERVER_KNOBS->BLOB_MANIFEST_BACKUP_INTERVAL));
}
}
@ -5371,7 +5376,7 @@ ACTOR Future<Void> blobManager(BlobManagerInterface bmInterf,
if (SERVER_KNOBS->BG_ENABLE_MERGING) {
self->addActor.send(granuleMergeChecker(self));
}
if (SERVER_KNOBS->BLOB_MANIFEST_BACKUP && !isFullRestoreMode()) {
if (SERVER_KNOBS->BLOB_MANIFEST_BACKUP && !self->isFullRestoreMode) {
self->addActor.send(backupManifest(self));
}

View File

@ -24,6 +24,7 @@
#include "fdbclient/BackupContainer.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbclient/ClientBooleanParams.h"
#include "fdbserver/Knobs.h"
#include "flow/FastRef.h"
#include "flow/Trace.h"
@ -60,7 +61,7 @@ struct BlobManifestFile {
int64_t seqNo{ 0 };
BlobManifestFile(const std::string& path) {
if (sscanf(path.c_str(), MANIFEST_FOLDER "/manifest.%" SCNd64 ".%" SCNd64, &epoch, &seqNo) == 2) {
if (sscanf(path.c_str(), MANIFEST_FOLDER "/" MANIFEST_FOLDER ".%" SCNd64 ".%" SCNd64, &epoch, &seqNo) == 2) {
fileName = path;
}
}
@ -76,7 +77,7 @@ struct BlobManifestFile {
BlobManifestFile file(path);
return file.epoch > 0 && file.seqNo > 0;
};
BackupContainerFileSystem::FilesAndSizesT filesAndSizes = wait(reader->listFiles(MANIFEST_FOLDER, filter));
BackupContainerFileSystem::FilesAndSizesT filesAndSizes = wait(reader->listFiles(MANIFEST_FOLDER "/", filter));
std::vector<BlobManifestFile> result;
for (auto& f : filesAndSizes) {
@ -107,6 +108,9 @@ public:
try {
state Standalone<BlobManifest> manifest;
Standalone<VectorRef<KeyValueRef>> rows = wait(getSystemKeys(self));
if (rows.size() == 0) {
return Void();
}
manifest.rows = rows;
Value data = encode(manifest);
wait(writeToFile(self, data));
@ -134,10 +138,23 @@ private:
blobRangeKeys // Key ranges managed by blob
};
for (auto range : ranges) {
// todo use getRangeStream for better performance
RangeResult result = wait(tr.getRange(range, GetRangeLimits::BYTE_LIMIT_UNLIMITED));
for (auto& row : result) {
rows.push_back_deep(rows.arena(), KeyValueRef(row.key, row.value));
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(range.begin);
state KeySelectorRef end = firstGreaterOrEqual(range.end);
loop {
RangeResult result = wait(tr.getRange(begin, end, limits, Snapshot::True));
for (auto& row : result) {
rows.push_back_deep(rows.arena(), KeyValueRef(row.key, row.value));
}
if (!result.more) {
break;
}
if (result.readThrough.present()) {
begin = firstGreaterOrEqual(result.readThrough.get());
} else {
begin = firstGreaterThan(result.end()[-1].key);
}
}
}
return rows;
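This bounded scan, resumed from readThrough when present and otherwise from just past the last returned key, is the pagination idiom this commit applies to every large range read (manifest rows, granule mappings, granule files, restore status). Condensed into one hedged sketch with illustrative names:

ACTOR static Future<Standalone<VectorRef<KeyValueRef>>> readRangePaged(Transaction* tr, KeyRange range) {
    state Standalone<VectorRef<KeyValueRef>> rows;
    state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
    limits.minRows = 0;
    state KeySelectorRef begin = firstGreaterOrEqual(range.begin);
    state KeySelectorRef end = firstGreaterOrEqual(range.end);
    loop {
        RangeResult result = wait(tr->getRange(begin, end, limits, Snapshot::True));
        for (auto& row : result) {
            rows.push_back_deep(rows.arena(), row);
        }
        if (!result.more) {
            return rows;
        }
        // Resume after what the storage server actually read.
        if (result.readThrough.present()) {
            begin = firstGreaterOrEqual(result.readThrough.get());
        } else {
            begin = firstGreaterThan(result.end()[-1].key);
        }
    }
}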
@ -149,11 +166,19 @@ private:
// Write data to blob manifest file
ACTOR static Future<Void> writeToFile(Reference<BlobManifestDumper> self, Value data) {
static int32_t lastWrittenBytes = 0;
if (data.size() == lastWrittenBytes) {
dprint("Skip writting blob manifest with same size {}\n", lastWrittenBytes);
return Void();
}
lastWrittenBytes = data.size();
state Reference<BackupContainerFileSystem> writer;
state std::string fullPath;
std::tie(writer, fullPath) = self->blobConn_->createForWrite(MANIFEST_FOLDER);
state std::string fileName = format(MANIFEST_FOLDER "/manifest.%lld.%lld", self->epoch_, self->seqNo_);
state std::string fileName =
format(MANIFEST_FOLDER "/" MANIFEST_FOLDER ".%lld.%lld", self->epoch_, self->seqNo_);
state Reference<IBackupFile> file = wait(writer->writeFile(fileName));
wait(file->append(data.begin(), data.size()));
wait(file->finish());
@ -208,7 +233,7 @@ public:
ACTOR static Future<Void> execute(Reference<BlobManifestLoader> self) {
try {
Value data = wait(readFromFile(self));
Standalone<BlobManifest> manifest = decode(data);
state Standalone<BlobManifest> manifest = decode(data);
wait(writeSystemKeys(self, manifest.rows));
BlobGranuleRestoreVersionVector _ = wait(listGranules(self));
} catch (Error& e) {
@ -227,13 +252,32 @@ public:
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
std::vector<KeyRangeRef> granules;
state Standalone<VectorRef<KeyRef>> blobRanges;
// Read all granules
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(blobGranuleMappingKeys.begin);
state KeySelectorRef end = firstGreaterOrEqual(blobGranuleMappingKeys.end);
loop {
RangeResult rows = wait(tr.getRange(begin, end, limits, Snapshot::True));
for (auto& row : rows) {
blobRanges.push_back_deep(blobRanges.arena(), row.key);
}
if (!rows.more) {
break;
}
if (rows.readThrough.present()) {
begin = firstGreaterOrEqual(rows.readThrough.get());
} else {
begin = firstGreaterThan(rows.end()[-1].key);
}
}
// check each granule range
state int i = 0;
auto limit = GetRangeLimits::BYTE_LIMIT_UNLIMITED;
state RangeResult blobRanges = wait(tr.getRange(blobGranuleMappingKeys, limit));
for (i = 0; i < blobRanges.size() - 1; i++) {
Key startKey = blobRanges[i].key.removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].key.removePrefix(blobGranuleMappingKeys.begin);
Key startKey = blobRanges[i].removePrefix(blobGranuleMappingKeys.begin);
Key endKey = blobRanges[i + 1].removePrefix(blobGranuleMappingKeys.begin);
state KeyRange granuleRange = KeyRangeRef(startKey, endKey);
try {
Standalone<BlobGranuleRestoreVersion> granule = wait(getGranule(&tr, granuleRange));
@ -296,17 +340,32 @@ private:
// Write system keys to database
ACTOR static Future<Void> writeSystemKeys(Reference<BlobManifestLoader> self, VectorRef<KeyValueRef> rows) {
state int start = 0;
state int end = 0;
for (start = 0; start < rows.size(); start = end) {
end = std::min(start + SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS, rows.size());
wait(writeSystemKeys(self, rows, start, end));
}
return Void();
}
// Write system keys from start index to end (exclusive), so that we don't exceed the transaction size limit
ACTOR static Future<Void> writeSystemKeys(Reference<BlobManifestLoader> self,
VectorRef<KeyValueRef> rows,
int start,
int end) {
state Transaction tr(self->db_);
loop {
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
for (auto& row : rows) {
tr.set(row.key, row.value);
for (int i = start; i < end; ++i) {
tr.set(rows[i].key, rows[i].value);
}
wait(tr.commit());
dprint("Blob manifest loaded {} rows\n", rows.size());
dprint("Blob manifest loaded rows from {} to {}\n", start, end);
TraceEvent("BlobManifestLoader").detail("RowStart", start).detail("RowEnd", end);
return Void();
} catch (Error& e) {
wait(tr.onError(e));
@ -320,8 +379,7 @@ private:
KeyRange historyKeyRange = blobGranuleHistoryKeyRangeFor(range);
// reverse lookup so that the first row is the newest version
state RangeResult results =
wait(tr->getRange(historyKeyRange, GetRangeLimits::BYTE_LIMIT_UNLIMITED, Snapshot::False, Reverse::True));
wait(tr->getRange(historyKeyRange, GetRangeLimits::BYTE_LIMIT_UNLIMITED, Snapshot::True, Reverse::True));
for (KeyValueRef row : results) {
state KeyRange keyRange;
state Version version;
@ -363,24 +421,39 @@ private:
// List all files for given granule
ACTOR static Future<std::vector<GranuleFileVersion>> listGranuleFiles(Transaction* tr, UID granuleID) {
state std::vector<GranuleFileVersion> files;
state KeyRange fileKeyRange = blobGranuleFileKeyRangeFor(granuleID);
RangeResult results = wait(tr->getRange(fileKeyRange, GetRangeLimits::BYTE_LIMIT_UNLIMITED));
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(fileKeyRange.begin);
state KeySelectorRef end = firstGreaterOrEqual(fileKeyRange.end);
loop {
RangeResult results = wait(tr->getRange(begin, end, limits, Snapshot::True));
for (auto& row : results) {
UID gid;
Version version;
uint8_t fileType;
Standalone<StringRef> filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
std::vector<GranuleFileVersion> files;
for (auto& row : results) {
UID gid;
Version version;
uint8_t fileType;
Standalone<StringRef> filename;
int64_t offset;
int64_t length;
int64_t fullFileLength;
Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(row.key);
std::tie(filename, offset, length, fullFileLength, cipherKeysMeta) = decodeBlobGranuleFileValue(row.value);
GranuleFileVersion vs = { version, fileType, filename.toString(), length };
files.push_back(vs);
std::tie(gid, version, fileType) = decodeBlobGranuleFileKey(row.key);
std::tie(filename, offset, length, fullFileLength, cipherKeysMeta) =
decodeBlobGranuleFileValue(row.value);
GranuleFileVersion vs = { version, fileType, filename.toString(), length };
files.push_back(vs);
}
if (!results.more) {
break;
}
if (results.readThrough.present()) {
begin = firstGreaterOrEqual(results.readThrough.get());
} else {
begin = firstGreaterThan(results.end()[-1].key);
}
}
return files;
}
@ -453,3 +526,40 @@ ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider
int64_t epoc = wait(BlobManifestLoader::lastBlobEpoc(loader));
return epoc;
}
// Return true if the given key range is being restored
ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef keys) {
state Transaction tr(db);
loop {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
try {
state GetRangeLimits limits(SERVER_KNOBS->BLOB_MANIFEST_RW_ROWS);
limits.minRows = 0;
state KeySelectorRef begin = firstGreaterOrEqual(blobRestoreCommandKeys.begin);
state KeySelectorRef end = firstGreaterOrEqual(blobRestoreCommandKeys.end);
loop {
RangeResult ranges = wait(tr.getRange(begin, end, limits, Snapshot::True));
for (auto& r : ranges) {
KeyRange keyRange = decodeBlobRestoreCommandKeyFor(r.key);
if (keyRange.contains(keys)) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(r.value);
return status.progress < 100; // restore is still in progress
}
}
if (!ranges.more) {
break;
}
if (ranges.readThrough.present()) {
begin = firstGreaterOrEqual(ranges.readThrough.get());
} else {
begin = firstGreaterThan(ranges.end()[-1].key);
}
}
return false;
} catch (Error& e) {
wait(tr.onError(e));
}
}
}

View File

@ -21,6 +21,7 @@
#include "flow/ActorCollection.h"
#include "flow/FastRef.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/flow.h"
#include "fdbclient/StorageServerInterface.h"
#include "fdbclient/BlobConnectionProvider.h"
@ -63,14 +64,7 @@ public:
// Start migration
ACTOR static Future<Void> start(Reference<BlobMigrator> self) {
if (!isFullRestoreMode()) {
return Void();
}
wait(delay(10)); // TODO need to wait for a signal for readiness of blob manager
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
self->blobGranules_ = granules;
wait(checkIfReadyForMigration(self));
wait(prepare(self, normalKeys));
wait(advanceVersion(self));
wait(serverLoop(self));
@ -78,6 +72,28 @@ public:
}
private:
// Check if blob manifest is loaded so that blob migration can start
ACTOR static Future<Void> checkIfReadyForMigration(Reference<BlobMigrator> self) {
loop {
bool isFullRestore = wait(isFullRestoreMode(self->db_, normalKeys));
if (isFullRestore) {
BlobGranuleRestoreVersionVector granules = wait(listBlobGranules(self->db_, self->blobConn_));
if (!granules.empty()) {
self->blobGranules_ = granules;
for (BlobGranuleRestoreVersion granule : granules) {
TraceEvent("RestorableGranule")
.detail("GranuleId", granule.granuleID.toString())
.detail("KeyRange", granule.keyRange.toString())
.detail("Version", granule.version)
.detail("SizeInBytes", granule.sizeInBytes);
}
return Void();
}
}
wait(delay(SERVER_KNOBS->BLOB_MIGRATOR_CHECK_INTERVAL));
}
}
// Prepare for data migration for given key range.
ACTOR static Future<Void> prepare(Reference<BlobMigrator> self, KeyRangeRef keys) {
// Register as a storage server, so that DataDistributor could start data movement after
@ -136,8 +152,9 @@ private:
}
}
if (owning) {
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
wait(krmSetRange(&tr, serverKeysPrefixFor(id), keys, serverKeysFalse));
dprint("Unassign {} from storage server {}\n", keys.toString(), id.toString());
TraceEvent("UnassignKeys").detail("Keys", keys.toString()).detail("From", id.toString());
}
}
wait(tr.commit());
@ -185,8 +202,10 @@ private:
// Calculated progress
int64_t total = sizeInBytes(self);
int progress = (total - incompleted) * 100 / total;
bool done = incompleted == 0;
dprint("Progress {} :{}%. done {}\n", serverID.toString(), progress, done);
state bool done = incompleted == 0;
dprint("Migration progress :{}%. done {}\n", progress, done);
TraceEvent("BlobMigratorProgress").detail("Progress", progress).detail("Done", done);
wait(updateProgress(self, normalKeys, progress));
return done;
} catch (Error& e) {
wait(tr.onError(e));
@ -194,6 +213,32 @@ private:
}
}
// Update restore progress
ACTOR static Future<Void> updateProgress(Reference<BlobMigrator> self, KeyRangeRef range, int progress) {
state Transaction tr(self->db_);
loop {
try {
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
state Key key = blobRestoreCommandKeyFor(range);
Optional<Value> value = wait(tr.get(key));
if (value.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(value.get());
if (progress > status.progress) {
status.progress = progress;
Value updatedValue = blobRestoreCommandValueFor(status);
tr.set(key, updatedValue);
wait(tr.commit());
}
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
// Advance version, so that future commits will have a larger version than the restored data
ACTOR static Future<Void> advanceVersion(Reference<BlobMigrator> self) {
state Transaction tr(self->db_);
@ -207,6 +252,7 @@ private:
if (currentVersion <= expectedVersion) {
tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(expectedVersion + 1, Unversioned()));
dprint("Advance version from {} to {}\n", currentVersion, expectedVersion);
TraceEvent("AdvanceVersion").detail("Current", currentVersion).detail("New", expectedVersion);
wait(tr.commit());
}
return Void();
@ -218,7 +264,7 @@ private:
// Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
self->actors_.add(waitFailureServer(self->interf_.waitFailure.getFuture()));
self->actors_.add(logProgress(self));
self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self));
@ -226,6 +272,7 @@ private:
try {
choose {
when(HaltBlobMigratorRequest req = waitNext(self->interf_.haltBlobMigrator.getFuture())) {
dprint("Stopping blob migrator {}\n", self->interf_.id().toString());
req.reply.send(Void());
TraceEvent("BlobMigratorHalted", self->interf_.id()).detail("ReqID", req.requesterID);
break;
@ -237,6 +284,8 @@ private:
throw;
}
}
self->actors_.clear(true);
dprint("Stopped blob migrator {}\n", self->interf_.id().toString());
return Void();
}
@ -267,7 +316,7 @@ private:
req.reply.send(rep);
}
when(GetStorageMetricsRequest req = waitNext(ssi.getStorageMetrics.getFuture())) {
fmt::print("Handle GetStorageMetrics\n");
// fmt::print("Handle GetStorageMetrics\n");
StorageMetrics metrics;
metrics.bytes = sizeInBytes(self);
GetStorageMetricsReply resp;
@ -331,7 +380,7 @@ private:
req.reply.sendError(unsupported_operation());
}
when(UpdateCommitCostRequest req = waitNext(ssi.updateCommitCostRequest.getFuture())) {
dprint("Unsupported UpdateCommitCostRequest\n");
// dprint("Unsupported UpdateCommitCostRequest\n");
req.reply.sendError(unsupported_operation());
}
when(FetchCheckpointKeyValuesRequest req = waitNext(ssi.fetchCheckpointKeyValues.getFuture())) {
@ -358,9 +407,9 @@ private:
}
ACTOR static Future<Void> processStorageQueuingMetricsRequest(StorageQueuingMetricsRequest req) {
dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME get rid of this delay. it's a temp solution to avoid starvation scheduling of DD
// processes
// dprint("Unsupported StorageQueuingMetricsRequest\n");
// FIXME get rid of this delay. it's a temp solution to avoid starvation scheduling of DD
// processes
wait(delay(1));
req.reply.sendError(unsupported_operation());
return Void();
@ -398,7 +447,8 @@ private:
// Main entry point
ACTOR Future<Void> blobMigrator(BlobMigratorInterface interf, Reference<AsyncVar<ServerDBInfo> const> dbInfo) {
fmt::print("Start blob migrator {} \n", interf.id().toString());
TraceEvent("StartBlobMigrator").detail("Interface", interf.id().toString());
dprint("Starting blob migrator {}\n", interf.id().toString());
try {
Reference<BlobMigrator> self = makeReference<BlobMigrator>(dbInfo, interf);
wait(BlobMigrator::start(self));

View File

@ -292,6 +292,8 @@ struct BlobWorkerData : NonCopyable, ReferenceCounted<BlobWorkerData> {
int64_t lastResidentMemory = 0;
double lastResidentMemoryCheckTime = -100.0;
bool isFullRestoreMode = false;
BlobWorkerData(UID id, Reference<AsyncVar<ServerDBInfo> const> dbInfo, Database db)
: id(id), db(db), tenantData(BGTenantMap(dbInfo)), dbInfo(dbInfo),
initialSnapshotLock(new FlowLock(SERVER_KNOBS->BLOB_WORKER_INITIAL_SNAPSHOT_PARALLELISM)),
@ -2146,7 +2148,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
}
// No need to start Change Feed in full restore mode
if (isFullRestoreMode())
if (bwData->isFullRestoreMode)
return Void();
checkMergeCandidate = granuleCheckMergeCandidate(bwData,
@ -2171,13 +2173,16 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// will get an exception if we try to read any popped data, killing this actor
readOldChangeFeed = true;
// because several feeds will be reading the same version range of this change feed at the same time, set
// cache result to true
oldChangeFeedFuture = bwData->db->getChangeFeedStream(cfData,
oldCFKey.get(),
startVersion + 1,
startState.changeFeedStartVersion,
metadata->keyRange,
bwData->changeFeedStreamReplyBufferSize,
false);
false,
{ ReadType::NORMAL, CacheResult::True });
} else {
readOldChangeFeed = false;
@ -2281,7 +2286,7 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
// popped up to V+1 is ok. Or in other words, if the last delta @ V, we only missed data
// at V+1 onward if popVersion >= V+2
if (metadata->bufferedDeltaVersion < metadata->activeCFData.get()->popVersion - 1) {
CODE_PROBE(true, "Blob Worker detected popped");
CODE_PROBE(true, "Blob Worker detected popped", probe::decoration::rare);
TraceEvent("BlobWorkerChangeFeedPopped", bwData->id)
.detail("Granule", metadata->keyRange)
.detail("GranuleID", startState.granuleID)
@ -2460,6 +2465,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
if (readOldChangeFeed) {
ASSERT(cfRollbackVersion + 1 < startState.changeFeedStartVersion);
ASSERT(oldCFKey.present());
// because several feeds will be reading the same version range of this change
// feed at the same time, set cache result to true
oldChangeFeedFuture =
bwData->db->getChangeFeedStream(cfData,
oldCFKey.get(),
@ -2467,7 +2474,8 @@ ACTOR Future<Void> blobGranuleUpdateFiles(Reference<BlobWorkerData> bwData,
startState.changeFeedStartVersion,
metadata->keyRange,
bwData->changeFeedStreamReplyBufferSize,
false);
false,
{ ReadType::NORMAL, CacheResult::True });
} else {
if (cfRollbackVersion + 1 < startState.changeFeedStartVersion) {
@ -3588,7 +3596,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
state Reference<GranuleMetadata> metadata = m;
// state Version granuleBeginVersion = req.beginVersion;
// skip waiting for CF ready for recovery mode
if (!isFullRestoreMode()) {
if (!bwData->isFullRestoreMode) {
choose {
when(wait(metadata->readable.getFuture())) {}
when(wait(metadata->cancelled.getFuture())) { throw wrong_shard_server(); }
@ -3646,7 +3654,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// this is an active granule query
loop {
// skip check since CF doesn't start for bare metal recovery mode
if (isFullRestoreMode()) {
if (bwData->isFullRestoreMode) {
break;
}
if (!metadata->activeCFData.get().isValid() || !metadata->cancelled.canBeSet()) {
@ -3689,7 +3697,7 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
// if feed was popped by another worker and BW only got empty versions, it wouldn't itself see that it
// got popped, but we can still reject the in theory this should never happen with other protections but
// it's a useful and inexpensive sanity check
if (!isFullRestoreMode()) {
if (!bwData->isFullRestoreMode) {
Version emptyVersion = metadata->activeCFData.get()->popVersion - 1;
if (req.readVersion > metadata->durableDeltaVersion.get() &&
emptyVersion > metadata->bufferedDeltaVersion) {
@ -3985,7 +3993,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
ForcedPurgeState purgeState = wait(fForcedPurgeState);
if (purgeState != ForcedPurgeState::NonePurged) {
CODE_PROBE(true, "Worker trying to open force purged granule");
CODE_PROBE(true, "Worker trying to open force purged granule", probe::decoration::rare);
if (BW_DEBUG) {
fmt::print("Granule [{0} - {1}) is force purged on BW {2}, abandoning\n",
req.keyRange.begin.printable(),
@ -3995,6 +4003,9 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
throw granule_assignment_conflict();
}
bool isFullRestore = wait(isFullRestoreMode(bwData->db, req.keyRange));
bwData->isFullRestoreMode = isFullRestore;
Optional<Value> prevLockValue = wait(fLockValue);
state bool hasPrevOwner = prevLockValue.present();
state bool createChangeFeed = false;
@ -4069,7 +4080,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
// for recovery mode - don't create change feed, don't create snapshot
if (isFullRestoreMode()) {
if (bwData->isFullRestoreMode) {
createChangeFeed = false;
info.doSnapshot = false;
GranuleFiles granuleFiles = wait(loadPreviousFiles(&tr, info.granuleID));
@ -4091,7 +4102,7 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
}
}
if (createChangeFeed && !isFullRestoreMode()) {
if (createChangeFeed && !bwData->isFullRestoreMode) {
// create new change feed for new version of granule
wait(updateChangeFeed(
&tr, granuleIDToCFKey(info.granuleID), ChangeFeedStatus::CHANGE_FEED_CREATE, req.keyRange));
@ -4103,7 +4114,8 @@ ACTOR Future<GranuleStartState> openGranule(Reference<BlobWorkerData> bwData, As
// If anything in previousGranules, need to do the handoff logic and set
// ret.previousChangeFeedId, and the previous durable version will come from the previous
// granules
if (info.history.present() && info.history.get().value.parentVersions.size() > 0 && !isFullRestoreMode()) {
if (info.history.present() && info.history.get().value.parentVersions.size() > 0 &&
!bwData->isFullRestoreMode) {
CODE_PROBE(true, "Granule open found parent");
if (info.history.get().value.parentVersions.size() == 1) { // split
state KeyRangeRef parentRange(info.history.get().value.parentBoundaries[0],

View File

@ -23,6 +23,7 @@
#include <map>
#include <memory>
#include <set>
#include <tuple>
#include <vector>
#include "fdbclient/FDBTypes.h"
@ -691,7 +692,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
WorkerDetails newMGWorker;
if (self->db.blobGranulesEnabled.get()) {
newBMWorker = findNewProcessForSingleton(self, ProcessClass::BlobManager, id_used);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
newMGWorker = findNewProcessForSingleton(self, ProcessClass::BlobMigrator, id_used);
}
}
@ -710,7 +711,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
ProcessClass::Fitness bestFitnessForMG;
if (self->db.blobGranulesEnabled.get()) {
bestFitnessForBM = findBestFitnessForSingleton(self, newBMWorker, ProcessClass::BlobManager);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
bestFitnessForMG = findBestFitnessForSingleton(self, newMGWorker, ProcessClass::BlobManager);
}
}
@ -744,7 +745,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) {
bmHealthy = isHealthySingleton<BlobManagerInterface>(
self, newBMWorker, bmSingleton, bestFitnessForBM, self->recruitingBlobManagerID);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
mgHealthy = isHealthySingleton<BlobMigratorInterface>(
self, newMGWorker, mgSingleton, bestFitnessForMG, self->recruitingBlobMigratorID);
}
@ -775,7 +776,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) {
currBMProcessId = bmSingleton.interface.get().locality.processId();
newBMProcessId = newBMWorker.interf.locality.processId();
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
currMGProcessId = mgSingleton.interface.get().locality.processId();
newMGProcessId = newMGWorker.interf.locality.processId();
}
@ -792,7 +793,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (self->db.blobGranulesEnabled.get()) {
currPids.emplace_back(currBMProcessId);
newPids.emplace_back(newBMProcessId);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
currPids.emplace_back(currMGProcessId);
newPids.emplace_back(newMGProcessId);
}
@ -810,7 +811,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
if (!self->db.blobGranulesEnabled.get()) {
ASSERT(currColocMap[currBMProcessId] == 0);
ASSERT(newColocMap[newBMProcessId] == 0);
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
ASSERT(currColocMap[currMGProcessId] == 0);
ASSERT(newColocMap[newMGProcessId] == 0);
}
@ -836,7 +837,7 @@ void checkBetterSingletons(ClusterControllerData* self) {
ddSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && newColocMap[newBMProcessId] < currColocMap[currBMProcessId]) {
bmSingleton.recruit(self);
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() &&
} else if (self->db.blobGranulesEnabled.get() && self->db.blobRestoreEnabled.get() &&
newColocMap[newMGProcessId] < currColocMap[currMGProcessId]) {
mgSingleton.recruit(self);
} else if (SERVER_KNOBS->ENABLE_ENCRYPTION && newColocMap[newEKPProcessId] < currColocMap[currEKPProcessId]) {
@ -1404,13 +1405,13 @@ ACTOR Future<Void> registerWorker(RegisterWorkerRequest req,
self, w, currSingleton, registeringSingleton, self->recruitingRatekeeperID);
}
if (self->db.blobGranulesEnabled.get() && isFullRestoreMode() && req.blobManagerInterf.present()) {
if (self->db.blobGranulesEnabled.get() && req.blobManagerInterf.present()) {
auto currSingleton = BlobManagerSingleton(self->db.serverInfo->get().blobManager);
auto registeringSingleton = BlobManagerSingleton(req.blobManagerInterf);
haltRegisteringOrCurrentSingleton<BlobManagerInterface>(
self, w, currSingleton, registeringSingleton, self->recruitingBlobManagerID);
}
if (req.blobMigratorInterf.present()) {
if (req.blobMigratorInterf.present() && self->db.blobRestoreEnabled.get()) {
auto currSingleton = BlobMigratorSingleton(self->db.serverInfo->get().blobMigrator);
auto registeringSingleton = BlobMigratorSingleton(req.blobMigratorInterf);
haltRegisteringOrCurrentSingleton<BlobMigratorInterface>(
@ -2553,6 +2554,43 @@ ACTOR Future<int64_t> getNextBMEpoch(ClusterControllerData* self) {
}
}
ACTOR Future<Void> watchBlobRestoreCommand(ClusterControllerData* self) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(self->cx);
state Key blobRestoreCommandKey = blobRestoreCommandKeyFor(normalKeys);
loop {
try {
tr->reset();
tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
Optional<Value> blobRestoreCommand = wait(tr->get(blobRestoreCommandKey));
if (blobRestoreCommand.present()) {
Standalone<BlobRestoreStatus> status = decodeBlobRestoreStatus(blobRestoreCommand.get());
TraceEvent("WatchBlobRestoreCommand").detail("Progress", status.progress);
if (status.progress == 0) {
self->db.blobRestoreEnabled.set(true);
if (self->db.blobGranulesEnabled.get()) {
const auto& blobManager = self->db.serverInfo->get().blobManager;
if (blobManager.present()) {
BlobManagerSingleton(blobManager)
.haltBlobGranules(self, blobManager.get().locality.processId());
}
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
if (blobMigrator.present()) {
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
}
}
}
}
state Future<Void> watch = tr->watch(blobRestoreCommandKey);
wait(tr->commit());
wait(watch);
} catch (Error& e) {
wait(tr->onError(e));
}
}
}
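The loop above is the standard FDB watch idiom: read the key and act on it, register a watch, commit so the watch is installed server-side, then block until the key next changes. Stripped to its skeleton (inside an actor, with error handling as above):

loop {
    tr->reset();
    tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
    Optional<Value> value = wait(tr->get(key)); // act on the current value here
    state Future<Void> watch = tr->watch(key); // takes effect only after commit
    wait(tr->commit());
    wait(watch); // resumes when the key changes, then loop to re-read
}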
ACTOR Future<Void> startBlobMigrator(ClusterControllerData* self, double waitTime) {
// If master fails at the same time, give it a chance to clear master PID.
// Also wait to avoid too many consecutive recruits in a small time window.
@ -2629,9 +2667,8 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
}
loop {
if (self->db.serverInfo->get().blobMigrator.present() && !self->recruitBlobMigrator.get()) {
state Future<Void> wfClient =
waitFailureClient(self->db.serverInfo->get().blobMigrator.get().ssi.waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
state Future<Void> wfClient = waitFailureClient(self->db.serverInfo->get().blobMigrator.get().waitFailure,
SERVER_KNOBS->BLOB_MIGRATOR_FAILURE_TIME);
loop {
choose {
when(wait(wfClient)) {
@ -2643,11 +2680,11 @@ ACTOR Future<Void> monitorBlobMigrator(ClusterControllerData* self) {
when(wait(self->recruitBlobMigrator.onChange())) {}
}
}
} else if (self->db.blobGranulesEnabled.get() && isFullRestoreMode()) {
} else if (self->db.blobGranulesEnabled.get() && self->db.blobRestoreEnabled.get()) {
// if there is no blob migrator present but blob granules are now enabled, recruit a BM
wait(startBlobMigrator(self, recruitThrottler.newRecruitment()));
} else {
wait(self->db.blobGranulesEnabled.onChange());
wait(self->db.blobGranulesEnabled.onChange() || self->db.blobRestoreEnabled.onChange());
}
}
}
@ -2778,7 +2815,7 @@ ACTOR Future<Void> monitorBlobManager(ClusterControllerData* self) {
const auto& blobManager = self->db.serverInfo->get().blobManager;
BlobManagerSingleton(blobManager)
.haltBlobGranules(self, blobManager.get().locality.processId());
if (isFullRestoreMode()) {
if (self->db.blobRestoreEnabled.get()) {
const auto& blobMigrator = self->db.serverInfo->get().blobMigrator;
BlobMigratorSingleton(blobMigrator).halt(self, blobMigrator.get().locality.processId());
}
@ -3079,8 +3116,9 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(monitorDataDistributor(&self));
self.addActor.send(monitorRatekeeper(&self));
self.addActor.send(monitorBlobManager(&self));
self.addActor.send(monitorBlobMigrator(&self));
self.addActor.send(watchBlobGranulesConfigKey(&self));
self.addActor.send(monitorBlobMigrator(&self));
self.addActor.send(watchBlobRestoreCommand(&self));
self.addActor.send(monitorConsistencyScan(&self));
self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self));

View File

@ -414,7 +414,8 @@ ACTOR Future<Void> commitBatcher(ProxyCommitData* commitData,
}
Optional<TenantNameRef> const& tenantName = req.tenantInfo.name;
if (tenantName.present() && commitData->tenantsOverStorageQuota.count(tenantName.get()) > 0) {
if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED && tenantName.present() &&
commitData->tenantsOverStorageQuota.count(tenantName.get()) > 0) {
req.reply.sendError(storage_quota_exceeded());
continue;
}
@ -1309,7 +1310,7 @@ ACTOR Future<WriteMutationRefVar> writeMutationFetchEncryptKey(CommitBatchContex
wait(getLatestEncryptCipherKey(self->pProxyCommitData->db, domainId, p.first, BlobCipherMetrics::TLOG));
self->cipherKeys[domainId] = cipherKey;
CODE_PROBE(true, "Raw access mutation encryption");
CODE_PROBE(true, "Raw access mutation encryption", probe::decoration::rare);
ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID);
encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG);
self->toCommit.writeTypedMessage(encryptedMutation);
@ -2971,7 +2972,9 @@ ACTOR Future<Void> commitProxyServerCore(CommitProxyInterface proxy,
proxy.expireIdempotencyId,
commitData.expectedIdempotencyIdCountForKey,
&commitData.idempotencyClears));
addActor.send(monitorTenantsOverStorageQuota(proxy.id(), db, &commitData));
if (SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
addActor.send(monitorTenantsOverStorageQuota(proxy.id(), db, &commitData));
}
// wait for txnStateStore recovery
wait(success(commitData.txnStateStore->readValue(StringRef())));

View File

@ -316,7 +316,7 @@ class ConfigNodeImpl {
ACTOR static Future<Void> getConfigClasses(ConfigNodeImpl* self, ConfigTransactionGetConfigClassesRequest req) {
state Optional<CoordinatorsHash> locked = wait(getLocked(self));
if (locked.present()) {
CODE_PROBE(true, "attempting to read config classes from locked ConfigNode");
CODE_PROBE(true, "attempting to read config classes from locked ConfigNode", probe::decoration::rare);
req.reply.sendError(coordinators_changed());
return Void();
}
@ -360,7 +360,7 @@ class ConfigNodeImpl {
ACTOR static Future<Void> getKnobs(ConfigNodeImpl* self, ConfigTransactionGetKnobsRequest req) {
state Optional<CoordinatorsHash> locked = wait(getLocked(self));
if (locked.present()) {
CODE_PROBE(true, "attempting to read knobs from locked ConfigNode");
CODE_PROBE(true, "attempting to read knobs from locked ConfigNode", probe::decoration::rare);
req.reply.sendError(coordinators_changed());
return Void();
}

View File

@ -697,6 +697,9 @@ struct DDQueue : public IDDRelocationQueue {
RemoteTeamIsFull,
RemoteTeamIsNotHealthy,
NoAvailablePhysicalShard,
UnknownForceNew,
NoAnyHealthy,
DstOverloaded,
NumberOfTypes,
};
std::vector<int> retryFindDstReasonCount;
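A short sketch, assuming the DDQueue definitions above, of how the NumberOfTypes sentinel is meant to size and index the per-reason counters:

// NumberOfTypes is a count sentinel, not a real reason: it sizes the vector so that
// every reason, including the three added above, gets a slot.
std::vector<int> retryFindDstReasonCount(DDQueue::RetryFindDstReason::NumberOfTypes, 0);
// ... whenever a destination search must be retried for reason `reason`:
++retryFindDstReasonCount[reason];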
@ -1423,6 +1426,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
state double startTime = now();
state std::vector<UID> destIds;
state uint64_t debugID = deterministicRandom()->randomUInt64();
state bool enableShardMove = SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD;
try {
if (now() - self->lastInterval < 1.0) {
@ -1539,8 +1543,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
req.src = rd.src;
req.completeSources = rd.completeSources;
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 1) {
if (enableShardMove && tciIndex == 1) {
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
Optional<ShardsAffectedByTeamFailure::Team> remoteTeamWithPhysicalShard =
@ -1587,64 +1590,65 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
anyWithSource = true;
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() must select a remote team paired with a primary team,
// so it may select an almost-full remote team. In this case, we must re-select a remote
// team. We set foundTeams = false to avoid finishing team selection; then
// forceToUseNewPhysicalShard is set, which forces getTeam to select a remote team.
if (enableShardMove) {
if (tciIndex == 1 && !forceToUseNewPhysicalShard) {
// critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() must select a remote team paired with a primary
// team, so it may select an almost-full remote team. In this case, we must
// re-select a remote team. We set foundTeams = false to avoid finishing team
// selection; then forceToUseNewPhysicalShard is set, which forces getTeam to
// select a remote team.
double minAvailableSpaceRatio = bestTeam.first.get()->getMinAvailableSpaceRatio(true);
if (minAvailableSpaceRatio < SERVER_KNOBS->TARGET_AVAILABLE_SPACE_RATIO) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsFull;
foundTeams = false;
break;
}
}
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
// critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() must select a remote team paired with a primary
// team, so it may select an unhealthy remote team. In this case, we must
// re-select a remote team. We set foundTeams = false to avoid finishing team
// selection; then forceToUseNewPhysicalShard is set, which forces getTeam to
// select a remote team.
if (!bestTeam.first.get()->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
break;
}
}
bestTeams.emplace_back(bestTeam.first.get(), true);
// Always set bestTeams[i].second = true to disable the optimization for data moves between
// DCs, for the correctness of PhysicalShardCollection. Currently, enabling the optimization
// would break the PhysicalShardCollection invariant: once a physical shard is created with
// a specific set of SSes, this SS set never changes.
if (tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
if (forceToUseNewPhysicalShard &&
retryFindDstReason == DDQueue::RetryFindDstReason::None) {
// This is an abnormal state: we are forced to create a new physical shard, but we
// don't know why. UnknownForceNew tracks these unexplained forced creations of new
// physical shards.
retryFindDstReason = DDQueue::RetryFindDstReason::UnknownForceNew;
}
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
} else {
bestTeams.emplace_back(bestTeam.first.get(), bestTeam.second);
}
// get physicalShardIDCandidate
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
tciIndex == 0) {
ASSERT(foundTeams);
ShardsAffectedByTeamFailure::Team primaryTeam =
ShardsAffectedByTeamFailure::Team(bestTeams[0].first->getServerIDs(), true);
physicalShardIDCandidate =
self->physicalShardCollection->determinePhysicalShardIDGivenPrimaryTeam(
primaryTeam, metrics, forceToUseNewPhysicalShard, debugID);
ASSERT(physicalShardIDCandidate != UID().first() &&
physicalShardIDCandidate != anonymousShardId.first());
}
}
tciIndex++;
}
// critical to the correctness of team selection by PhysicalShardCollection:
// tryGetAvailableRemoteTeamWith() must select a remote team paired with a primary team,
// so it may select an unhealthy remote team.
// In this case, we must re-select a remote team.
// We set foundTeams = false to avoid finishing team selection;
// then forceToUseNewPhysicalShard is set, which forces getTeam to select a remote team.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD &&
bestTeams.size() > 1 && !forceToUseNewPhysicalShard) {
if (!bestTeams[1].first->isHealthy()) {
retryFindDstReason = DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy;
foundTeams = false;
}
}
// once we've found healthy candidate teams, make sure they're not overloaded with outstanding moves
// already
anyDestOverloaded = !canLaunchDest(bestTeams, rd.priority, self->destBusymap);
@ -1654,6 +1658,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
break;
}
if (retryFindDstReason == DDQueue::RetryFindDstReason::None && foundTeams) {
if (!anyHealthy) {
retryFindDstReason = DDQueue::RetryFindDstReason::NoAnyHealthy;
} else if (anyDestOverloaded) {
retryFindDstReason = DDQueue::RetryFindDstReason::DstOverloaded;
}
}
if (anyDestOverloaded) {
CODE_PROBE(true, "Destination overloaded throttled move");
destOverloadedCount++;
@ -1665,7 +1677,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
.detail("AnyDestOverloaded", anyDestOverloaded)
.detail("NumOfTeamCollections", self->teamCollections.size())
.detail("Servers", destServersString(bestTeams));
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
if (rd.isRestore() && destOverloadedCount > 50) {
throw data_move_dest_team_not_found();
}
@ -1689,14 +1701,14 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
// When forceToUseNewPhysicalShard = false, we try to get a paired primary team and remote team.
// However, this may fail.
// Any retry then uses a new physical shard, which enters the normal routine.
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
forceToUseNewPhysicalShard = true;
}
// TODO different trace event + knob for overloaded? Could wait on an async var for done moves
}
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
if (!rd.isRestore()) {
// when !rd.isRestore(), dataMoveId has just been set to physicalShardIDCandidate;
// thus, propagate physicalShardIDCandidate to the related data structures
@ -1954,7 +1966,7 @@ ACTOR Future<Void> dataDistributionRelocator(DDQueue* self,
self->shardsAffectedByTeamFailure->finishMove(rd.keys);
relocationComplete.send(rd);
if (SERVER_KNOBS->SHARD_ENCODE_LOCATION_METADATA && SERVER_KNOBS->ENABLE_DD_PHYSICAL_SHARD) {
if (enableShardMove) {
// update physical shard collection
std::vector<ShardsAffectedByTeamFailure::Team> selectedTeams;
for (int i = 0; i < bestTeams.size(); i++) {
@ -2525,6 +2537,12 @@ ACTOR Future<Void> dataDistributionQueue(Reference<IDDTxnProcessor> db,
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsFull])
.detail("RemoteTeamIsNotHealthy",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::RemoteTeamIsNotHealthy])
.detail("UnknownForceNew",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::UnknownForceNew])
.detail("NoAnyHealthy",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAnyHealthy])
.detail("DstOverloaded",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::DstOverloaded])
.detail(
"NoAvailablePhysicalShard",
self.retryFindDstReasonCount[DDQueue::RetryFindDstReason::NoAvailablePhysicalShard]);

View File

@ -623,7 +623,9 @@ std::vector<RangeToSplit> findTenantShardBoundaries(KeyRangeMap<ShardTrackedData
result.emplace_back(shardContainingTenantEnd, faultLines);
}
} else {
CODE_PROBE(true, "Shards that contain tenant key range not split since shard stats are unavailable");
CODE_PROBE(true,
"Shards that contain tenant key range not split since shard stats are unavailable",
probe::decoration::rare);
}
}
@ -1358,7 +1360,7 @@ ACTOR Future<Void> fetchTopKShardMetrics(DataDistributionTracker* self, GetTopKM
when(wait(g_network->isSimulated() && BUGGIFY_WITH_PROB(0.01) ? Never()
: fetchTopKShardMetrics_impl(self, req))) {}
when(wait(delay(SERVER_KNOBS->DD_SHARD_METRICS_TIMEOUT))) {
CODE_PROBE(true, "TopK DD_SHARD_METRICS_TIMEOUT", probe::decoration::rare);
CODE_PROBE(true, "TopK DD_SHARD_METRICS_TIMEOUT");
req.reply.send(GetTopKMetricsReply());
}
}
@ -2087,4 +2089,4 @@ TEST_CASE("/DataDistributor/Tracker/FetchTopK") {
ASSERT(reply.minReadLoad == -1);
return Void();
}
}

View File

@ -588,7 +588,6 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<DDTeamCollection> primaryTeamCollection;
state Reference<DDTeamCollection> remoteTeamCollection;
state bool trackerCancelled;
state bool ddIsTenantAware = SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED;
loop {
trackerCancelled = false;
self->initialized = Promise<Void>();
@ -610,7 +609,7 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
state Reference<AsyncVar<bool>> processingUnhealthy(new AsyncVar<bool>(false));
state Reference<AsyncVar<bool>> processingWiggle(new AsyncVar<bool>(false));
if (ddIsTenantAware) {
if (SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED || SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
self->ddTenantCache = makeReference<TenantCache>(cx, self->ddId);
wait(self->ddTenantCache.get()->build());
}
@ -684,6 +683,8 @@ ACTOR Future<Void> dataDistribution(Reference<DataDistributor> self,
"DDTenantCacheMonitor",
self->ddId,
&normalDDQueueErrors()));
}
if (self->ddTenantCache.present() && SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
actors.push_back(reportErrorsExcept(self->ddTenantCache.get()->monitorStorageQuota(),
"StorageQuotaTracker",
self->ddId,
@ -1320,7 +1321,7 @@ GetStorageWigglerStateReply getStorageWigglerStates(Reference<DataDistributor> s
TenantsOverStorageQuotaReply getTenantsOverStorageQuota(Reference<DataDistributor> self) {
TenantsOverStorageQuotaReply reply;
if (self->ddTenantCache.present()) {
if (self->ddTenantCache.present() && SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
reply.tenants = self->ddTenantCache.get()->getTenantsOverQuota();
}
return reply;
@ -1537,14 +1538,18 @@ ACTOR Future<Void> dataDistributor(DataDistributorInterface di, Reference<AsyncV
when(DistributorSnapRequest snapReq = waitNext(di.distributorSnapReq.getFuture())) {
auto& snapUID = snapReq.snapUID;
if (ddSnapReqResultMap.count(snapUID)) {
CODE_PROBE(true, "Data distributor received a duplicate finished snapshot request");
CODE_PROBE(true,
"Data distributor received a duplicate finished snapshot request",
probe::decoration::rare);
auto result = ddSnapReqResultMap[snapUID];
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
TraceEvent("RetryFinishedDistributorSnapRequest")
.detail("SnapUID", snapUID)
.detail("Result", result.isError() ? result.getError().code() : 0);
} else if (ddSnapReqMap.count(snapReq.snapUID)) {
CODE_PROBE(true, "Data distributor received a duplicate ongoing snapshot request");
CODE_PROBE(true,
"Data distributor received a duplicate ongoing snapshot request",
probe::decoration::rare);
TraceEvent("RetryOngoingDistributorSnapRequest").detail("SnapUID", snapUID);
ASSERT(snapReq.snapPayload == ddSnapReqMap[snapUID].snapPayload);
ddSnapReqMap[snapUID] = snapReq;

View File

@ -475,7 +475,7 @@ public:
if (targetTps.present()) {
auto const smoothedTargetTps = stats.updateAndGetTargetLimit(targetTps.get());
te.detail("SmoothedTargetTps", smoothedTargetTps).detail("NumProxies", numProxies);
result[tag] = smoothedTargetTps / numProxies;
result[tag] = std::max(1.0, smoothedTargetTps / numProxies);
} else {
te.disable();
}
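For intuition, a worked example with assumed numbers: the new floor guarantees every proxy may release at least one transaction per second even when the global target is tiny.

// Assumed: smoothedTargetTps = 4.0, numProxies = 10
// raw per-proxy rate : 4.0 / 10 = 0.4
// clamped rate       : std::max(1.0, 0.4) == 1.0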

View File

@ -49,7 +49,7 @@ bool GrvProxyTagThrottler::TagQueue::isMaxThrottled(double maxThrottleDuration)
}
void GrvProxyTagThrottler::TagQueue::rejectRequests(LatencyBandsMap& latencyBandsMap) {
CODE_PROBE(true, "GrvProxyTagThrottler rejecting requests");
CODE_PROBE(true, "GrvProxyTagThrottler rejecting requests", probe::decoration::rare);
while (!requests.empty()) {
auto& delayedReq = requests.front();
delayedReq.updateProxyTagThrottledDuration(latencyBandsMap);
@ -58,6 +58,14 @@ void GrvProxyTagThrottler::TagQueue::rejectRequests(LatencyBandsMap& latencyBand
}
}
void GrvProxyTagThrottler::TagQueue::endReleaseWindow(int64_t numStarted, double elapsed) {
if (rateInfo.present()) {
CODE_PROBE(requests.empty(), "Tag queue ending release window with empty request queue");
CODE_PROBE(!requests.empty(), "Tag queue ending release window with requests still queued");
rateInfo.get().endReleaseWindow(numStarted, requests.empty(), elapsed);
}
}
GrvProxyTagThrottler::GrvProxyTagThrottler(double maxThrottleDuration)
: maxThrottleDuration(maxThrottleDuration),
latencyBandsMap("GrvProxyTagThrottler",
@ -202,16 +210,14 @@ void GrvProxyTagThrottler::releaseTransactions(double elapsed,
}
}
// End release windows for queues with valid rateInfo
// End release windows for all tag queues
{
TransactionTagMap<uint32_t> transactionsReleasedMap;
for (const auto& [tag, count] : transactionsReleased) {
transactionsReleasedMap[tag] = count;
}
for (auto& [tag, queue] : queues) {
if (queue.rateInfo.present()) {
queue.rateInfo.get().endReleaseWindow(transactionsReleasedMap[tag], false, elapsed);
}
queue.endReleaseWindow(transactionsReleasedMap[tag], elapsed);
}
}
// If the capacity is increased, that means the vector has been illegally resized, potentially
@ -438,3 +444,33 @@ TEST_CASE("/GrvProxyTagThrottler/Fifo") {
wait(mockFifoClient(&throttler));
return Void();
}
// Tests that while throughput is low, the tag throttler
// does not accumulate too much budget.
//
// A server is set up to serve 10 transactions per second,
// then runs idle for 60 seconds. Then a client starts
// and attempts 20 transactions per second for 60 seconds.
// The server throttles the client so that it achieves only
// 10 transactions per second during this 60-second window.
// If the throttler is allowed to accumulate budget indefinitely
// during the idle 60 seconds, this test will fail.
TEST_CASE("/GrvProxyTagThrottler/LimitedIdleBudget") {
state GrvProxyTagThrottler throttler(5.0);
state TagSet tagSet;
state TransactionTagMap<uint32_t> counters;
{
TransactionTagMap<double> rates;
rates["sampleTag"_sr] = 10.0;
throttler.updateRates(rates);
}
tagSet.addTag("sampleTag"_sr);
state Future<Void> server = mockServer(&throttler);
wait(delay(60.0));
state Future<Void> client = mockClient(&throttler, TransactionPriority::DEFAULT, tagSet, 1, 20.0, &counters);
wait(timeout(client && server, 60.0, Void()));
TraceEvent("TagQuotaTest_LimitedIdleBudget").detail("Counter", counters["sampleTag"_sr]);
ASSERT(isNear(counters["sampleTag"_sr], 60.0 * 10.0));
return Void();
}

View File

@ -35,7 +35,7 @@ bool GrvTransactionRateInfo::canStart(int64_t numAlreadyStarted, int64_t count)
std::min(limit + budget, SERVER_KNOBS->START_TRANSACTION_MAX_TRANSACTIONS_TO_START);
}
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed) {
void GrvTransactionRateInfo::endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed) {
// Update the budget to accumulate any extra capacity available or remove any excess that was used.
// The actual delta is the portion of the limit we didn't use multiplied by the fraction of the rate window that
// elapsed.
@ -52,16 +52,15 @@ void GrvTransactionRateInfo::endReleaseWindow(int64_t numStartedAtPriority, bool
//
// Note that "rate window" here indicates a period of SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW seconds,
// whereas "release window" is the period between wait statements, with duration indicated by "elapsed."
budget =
std::max(0.0, budget + elapsed * (limit - numStartedAtPriority) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
budget = std::max(0.0, budget + elapsed * (limit - numStarted) / SERVER_KNOBS->START_TRANSACTION_RATE_WINDOW);
// If we are emptying out the queue of requests, then we don't need to carry much budget forward.
// If we kept accumulating budget, our responsiveness to changes in the workload could be compromised.
if (queueEmptyAtPriority) {
if (queueEmpty) {
budget = std::min(budget, SERVER_KNOBS->START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET);
}
smoothReleased.addDelta(numStartedAtPriority);
smoothReleased.addDelta(numStarted);
}
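A worked example of the budget update above, with assumed values rather than the shipped knob defaults:

// limit = 100 tx/s, numStarted = 60, elapsed = 0.5 s, START_TRANSACTION_RATE_WINDOW = 2.0 s
// budget += 0.5 * (100 - 60) / 2.0   =>  budget grows by 10 transactions
// if queueEmpty: budget = std::min(budget, START_TRANSACTION_MAX_EMPTY_QUEUE_BUDGET)
// so an idle queue cannot bank unbounded budget (cf. the LimitedIdleBudget test earlier in this diff)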
void GrvTransactionRateInfo::disable() {

View File

@ -740,7 +740,8 @@ private:
}
CODE_PROBE(self->enableEncryption && self->uncommittedBytes() > 0,
"KeyValueStoreMemory recovered partial transaction while encryption-at-rest is enabled");
"KeyValueStoreMemory recovered partial transaction while encryption-at-rest is enabled",
probe::decoration::rare);
self->semiCommit();
return Void();

View File

@ -180,7 +180,6 @@ bool MockStorageServer::allShardStatusEqual(const KeyRangeRef& range, MockShardS
bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::set<MockShardStatus>& status) {
auto ranges = serverKeys.intersectingRanges(range);
TraceEvent("AllShardStatusIn", id).detail("RangesEmpty", ranges.empty()).detail("Range", range);
ASSERT(!ranges.empty()); // at least the range is allKeys
for (auto it = ranges.begin(); it != ranges.end(); ++it) {
@ -192,7 +191,6 @@ bool MockStorageServer::allShardStatusIn(const KeyRangeRef& range, const std::se
void MockStorageServer::setShardStatus(const KeyRangeRef& range, MockShardStatus status, bool restrictSize) {
auto ranges = serverKeys.intersectingRanges(range);
TraceEvent("SetShardStatus", id).detail("KeyRange", range).detail("Status", status);
if (ranges.empty()) {
CODE_PROBE(true, "new shard is adding to server");
@ -202,15 +200,15 @@ void MockStorageServer::setShardStatus(const KeyRangeRef& range, MockShardStatus
// change the old status
if (ranges.begin().begin() < range.begin && ranges.begin().end() > range.end) {
CODE_PROBE(true, "Implicitly split single shard to 3 pieces");
CODE_PROBE(true, "Implicitly split single shard to 3 pieces", probe::decoration::rare);
threeWayShardSplitting(ranges.begin().range(), range, ranges.begin().cvalue().shardSize, restrictSize);
} else {
if (ranges.begin().begin() < range.begin) {
CODE_PROBE(true, "Implicitly split begin range to 2 pieces");
CODE_PROBE(true, "Implicitly split begin range to 2 pieces", probe::decoration::rare);
twoWayShardSplitting(ranges.begin().range(), range.begin, ranges.begin().cvalue().shardSize, restrictSize);
}
if (ranges.end().begin() > range.end) {
CODE_PROBE(true, "Implicitly split end range to 2 pieces");
CODE_PROBE(true, "Implicitly split end range to 2 pieces", probe::decoration::rare);
auto lastRange = ranges.end();
--lastRange;
twoWayShardSplitting(lastRange.range(), range.end, ranges.end().cvalue().shardSize, restrictSize);
@ -230,7 +228,7 @@ void MockStorageServer::setShardStatus(const KeyRangeRef& range, MockShardStatus
it.value() = ShardInfo{ status, newSize };
} else if ((oldStatus == MockShardStatus::COMPLETED || oldStatus == MockShardStatus::FETCHED) &&
(status == MockShardStatus::INFLIGHT || status == MockShardStatus::FETCHED)) {
CODE_PROBE(true, "Shard already on server");
CODE_PROBE(true, "Shard already on server", probe::decoration::rare);
} else {
TraceEvent(SevError, "MockShardStatusTransitionError", id)
.detail("From", oldStatus)
@ -620,7 +618,7 @@ Future<std::vector<KeyRangeLocationInfo>> MockGlobalState::getKeyRangeLocations(
ASSERT_EQ(srcTeam.size(), 1);
rep.results.emplace_back(it->range(), extractStorageServerInterfaces(srcTeam.front().servers));
}
CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited", probe::decoration::rare);
CODE_PROBE(it != ranges.end(), "getKeyRangeLocations is limited");
std::vector<KeyRangeLocationInfo> results;
for (int shard = 0; shard < rep.results.size(); shard++) {

View File

@ -196,7 +196,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
CODE_PROBE(true, "Zero fill within payload");
CODE_PROBE(true, "Zero fill within payload", probe::decoration::rare);
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -210,7 +210,7 @@ private:
}
}
if (zeroFillSize) {
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue", probe::decoration::rare);
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}

View File

@ -170,7 +170,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
CODE_PROBE(true, "Zero fill within payload");
CODE_PROBE(true, "Zero fill within payload", probe::decoration::rare);
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -186,7 +186,7 @@ private:
}
}
if (zeroFillSize) {
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue", probe::decoration::rare);
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}

View File

@ -289,11 +289,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
// Detect conflicts
double expire = now() + SERVER_KNOBS->SAMPLE_EXPIRATION_TIME;
ConflictBatch conflictBatch(self->conflictSet, &reply.conflictingKeyRangeMap, &reply.arena);
Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
if (g_network->isSimulated() && g_simulator->speedUpSimulation) {
newOldestVersion = req.version - std::max(5 * SERVER_KNOBS->VERSIONS_PER_SECOND,
SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS);
}
const Version newOldestVersion = req.version - SERVER_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS;
for (int t = 0; t < req.transactions.size(); t++) {
conflictBatch.addTransaction(req.transactions[t], newOldestVersion);
self->resolvedReadConflictRanges += req.transactions[t].read_conflict_ranges.size();
@ -372,7 +368,7 @@ ACTOR Future<Void> resolveBatch(Reference<Resolver> self,
isEncryptionOpSupported(EncryptOperationType::TLOG_ENCRYPTION) ? &cipherKeys
: nullptr);
}
CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery");
CODE_PROBE(self->forceRecovery, "Resolver detects forced recovery", probe::decoration::rare);
}
self->resolvedStateTransactions += req.txnStateTransactions.size();

View File

@ -172,7 +172,7 @@ private:
Standalone<StringRef> e = wait(self->queue->readNext(payloadSize + 1));
if (e.size() != payloadSize + 1) {
CODE_PROBE(true, "Zero fill within payload");
CODE_PROBE(true, "Zero fill within payload", probe::decoration::rare);
zeroFillSize = payloadSize + 1 - e.size();
break;
}
@ -188,7 +188,7 @@ private:
}
}
if (zeroFillSize) {
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue");
CODE_PROBE(true, "Fixing a partial commit at the end of the tlog queue", probe::decoration::rare);
for (int i = 0; i < zeroFillSize; i++)
self->queue->push(StringRef((const uint8_t*)"", 1));
}
@ -1262,7 +1262,7 @@ ACTOR Future<Void> processPopRequests(TLogData* self, Reference<LogData> logData
TraceEvent("PlayIgnoredPop", logData->logId).detail("Tag", tag.toString()).detail("Version", version);
ignoredPops.push_back(tLogPopCore(self, tag, version, logData));
if (++ignoredPopsPlayed % SERVER_KNOBS->TLOG_POP_BATCH_SIZE == 0) {
CODE_PROBE(true, "Yielding while processing pop requests");
CODE_PROBE(true, "Yielding while processing pop requests", probe::decoration::rare);
wait(yield());
}
}
@ -1857,7 +1857,8 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
CODE_PROBE(
true, "tlog peek second attempt ended at a different version", probe::decoration::rare);
replyPromise.sendError(operation_obsolete());
return Void();
}

View File

@ -127,25 +127,38 @@ public:
loop {
state double fetchStartTime = now();
state std::vector<TenantName> tenants = tenantCache->getTenantList();
state std::vector<TenantGroupName> groups;
for (const auto& [group, storage] : tenantCache->tenantStorageMap) {
groups.push_back(group);
}
state int i;
for (i = 0; i < tenants.size(); i++) {
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenants[i]);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
tenantCache->tenantStorageMap[tenants[i]].usage = size;
break;
} catch (Error& e) {
if (e.code() == error_code_tenant_not_found) {
tenantCache->tenantStorageMap.erase(tenants[i]);
for (i = 0; i < groups.size(); i++) {
state TenantGroupName group = groups[i];
state int64_t usage = 0;
// `tenants` needs to be a copy so that the erase (below) or inserts/erases from other
// functions (when this actor yields) do not interfere with the iteration
state std::unordered_set<TenantName> tenants = tenantCache->tenantStorageMap[group].tenants;
state std::unordered_set<TenantName>::iterator iter = tenants.begin();
for (; iter != tenants.end(); iter++) {
state TenantName tenant = *iter;
state ReadYourWritesTransaction tr(tenantCache->dbcx(), tenant);
loop {
try {
state int64_t size = wait(tr.getEstimatedRangeSizeBytes(normalKeys));
usage += size;
break;
} else {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
} catch (Error& e) {
if (e.code() == error_code_tenant_not_found) {
tenantCache->tenantStorageMap[group].tenants.erase(tenant);
break;
} else {
TraceEvent("TenantCacheGetStorageUsageError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
}
}
}
tenantCache->tenantStorageMap[group].usage = usage;
}
lastTenantListFetchTime = now();
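A standalone sketch (helper name hypothetical) of the snapshot-before-iterate rule the comment above describes: any wait() lets other actors run, so a container they might mutate must be copied before iterating across waits.

ACTOR Future<Void> sumUsageSketch(std::unordered_set<TenantName>* live, int64_t* total) {
	// copy: `live` may gain or lose entries while this actor is suspended in wait()
	state std::unordered_set<TenantName> snapshot = *live;
	state std::unordered_set<TenantName>::iterator it = snapshot.begin();
	for (; it != snapshot.end(); ++it) {
		int64_t size = wait(fetchTenantUsage(*it)); // hypothetical async helper
		*total += size;
	}
	return Void();
}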
@ -162,22 +175,24 @@ public:
state Transaction tr(tenantCache->dbcx());
loop {
loop {
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
for (auto const kv : currentQuotas) {
TenantName const tenant = kv.key.removePrefix(storageQuotaPrefix);
int64_t const quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[tenant].quota = quota;
}
tr.reset();
break;
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
try {
state RangeResult currentQuotas = wait(tr.getRange(storageQuotaKeys, CLIENT_KNOBS->TOO_MANY));
// Reset the quota for all groups; this essentially sets the quota to `max` for groups whose
// quota might have been cleared (i.e., groups that will not be returned by the `getRange` request above).
for (auto& [group, storage] : tenantCache->tenantStorageMap) {
storage.quota = std::numeric_limits<int64_t>::max();
}
for (const auto kv : currentQuotas) {
const TenantGroupName group = kv.key.removePrefix(storageQuotaPrefix);
const int64_t quota = BinaryReader::fromStringRef<int64_t>(kv.value, Unversioned());
tenantCache->tenantStorageMap[group].quota = quota;
}
tr.reset();
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
} catch (Error& e) {
TraceEvent("TenantCacheGetStorageQuotaError", tenantCache->id()).error(e);
wait(tr.onError(e));
}
wait(delay(SERVER_KNOBS->TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL));
}
}
};
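The effect of the reset-then-overwrite above, as a worked example with assumed data:

// Stored quota keys: { groupA -> 100 MB }. Groups in the cache: { groupA, groupB }.
// After the reset loop:    groupA.quota = int64 max, groupB.quota = int64 max
// After applying the scan: groupA.quota = 100 MB,    groupB.quota = int64 max
// groupB's quota key was cleared, so it falls back to unlimited without a delete pass.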
@ -189,6 +204,10 @@ void TenantCache::insert(TenantName& tenantName, TenantMapEntry& tenant) {
TenantInfo tenantInfo(tenantName, Optional<Standalone<StringRef>>(), tenant.id);
tenantCache[tenantPrefix] = makeReference<TCTenantInfo>(tenantInfo, tenant.prefix);
tenantCache[tenantPrefix]->updateCacheGeneration(generation);
if (tenant.tenantGroup.present()) {
tenantStorageMap[tenant.tenantGroup.get()].tenants.insert(tenantName);
}
}
void TenantCache::startRefresh() {
@ -289,13 +308,13 @@ Optional<Reference<TCTenantInfo>> TenantCache::tenantOwning(KeyRef key) const {
}
std::unordered_set<TenantName> TenantCache::getTenantsOverQuota() const {
std::unordered_set<TenantName> tenants;
for (const auto& [tenant, storage] : tenantStorageMap) {
std::unordered_set<TenantName> tenantsOverQuota;
for (const auto& [tenantGroup, storage] : tenantStorageMap) {
if (storage.usage > storage.quota) {
tenants.insert(tenant);
tenantsOverQuota.insert(storage.tenants.begin(), storage.tenants.end());
}
}
return tenants;
return tenantsOverQuota;
}
Future<Void> TenantCache::monitorTenantMap() {

View File

@ -2025,7 +2025,8 @@ public:
bool memoryOnly,
Reference<IPageEncryptionKeyProvider> keyProvider,
Promise<Void> errorPromise = {})
: keyProvider(keyProvider), ioLock(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_PRIORITY_LAUNCHS),
: keyProvider(keyProvider),
ioLock(makeReference<PriorityMultiLock>(FLOW_KNOBS->MAX_OUTSTANDING, SERVER_KNOBS->REDWOOD_IO_PRIORITIES)),
pageCacheBytes(pageCacheSizeBytes), desiredPageSize(desiredPageSize), desiredExtentSize(desiredExtentSize),
filename(filename), memoryOnly(memoryOnly), errorPromise(errorPromise),
remapCleanupWindowBytes(remapCleanupWindowBytes), concurrentExtentReads(new FlowLock(concurrentExtentReads)) {
@ -2037,7 +2038,7 @@ public:
// This sets the page cache size for all PageCacheT instances using the same evictor
pageCache.evictor().sizeLimit = pageCacheBytes;
g_redwoodMetrics.ioLock = &ioLock;
g_redwoodMetrics.ioLock = ioLock.getPtr();
if (!g_redwoodMetricsActor.isValid()) {
g_redwoodMetricsActor = redwoodMetricsLogger();
}
@ -2499,7 +2500,7 @@ public:
unsigned int level,
bool header) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(header ? ioMaxPriority : ioMinPriority));
state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(header ? ioMaxPriority : ioMinPriority));
++g_redwoodMetrics.metric.pagerDiskWrite;
g_redwoodMetrics.level(level).metrics.events.addEventReason(PagerEvents::PageWrite, reason);
if (self->memoryOnly) {
@ -2779,7 +2780,7 @@ public:
int blockSize,
int64_t offset,
int priority) {
state PriorityMultiLock::Lock lock = wait(self->ioLock.lock(std::min(priority, ioMaxPriority)));
state PriorityMultiLock::Lock lock = wait(self->ioLock->lock(std::min(priority, ioMaxPriority)));
++g_redwoodMetrics.metric.pagerDiskRead;
int bytes = wait(self->pageFile->read(pageBuffer->rawData() + pageOffset, blockSize, offset));
return bytes;
@ -3593,7 +3594,7 @@ public:
// The next section explicitly cancels all pending operations held in the pager
debug_printf("DWALPager(%s) shutdown kill ioLock\n", self->filename.c_str());
self->ioLock.kill();
self->ioLock->kill();
debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str());
self->recoverFuture.cancel();
@ -3802,7 +3803,7 @@ private:
Reference<IPageEncryptionKeyProvider> keyProvider;
PriorityMultiLock ioLock;
Reference<PriorityMultiLock> ioLock;
int64_t pageCacheBytes;
@ -8894,32 +8895,25 @@ void RedwoodMetrics::getIOLockFields(TraceEvent* e, std::string* s) {
int maxPriority = ioLock->maxPriority();
if (e != nullptr) {
e->detail("ActiveReads", ioLock->totalRunners());
e->detail("AwaitReads", ioLock->totalWaiters());
e->detail("IOActiveTotal", ioLock->getRunnersCount());
e->detail("IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
e->detail(format("ActiveP%d", priority), ioLock->numRunners(priority));
e->detail(format("AwaitP%d", priority), ioLock->numWaiters(priority));
e->detail(format("IOActiveP%d", priority), ioLock->getRunnersCount(priority));
e->detail(format("IOWaitingP%d", priority), ioLock->getWaitersCount(priority));
}
}
if (s != nullptr) {
std::string active = "Active";
std::string await = "Await";
*s += "\n";
*s += format("%-15s %-8u ", "ActiveReads", ioLock->totalRunners());
*s += format("%-15s %-8u ", "AwaitReads", ioLock->totalWaiters());
*s += "\n";
*s += format("%-15s %-8u ", "IOActiveTotal", ioLock->getRunnersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (active + 'P' + std::to_string(priority)).c_str(), ioLock->numRunners(priority));
*s += format("IOActiveP%-6d %-8u ", priority, ioLock->getRunnersCount(priority));
}
*s += "\n";
*s += format("%-15s %-8u ", "IOWaitingTotal", ioLock->getWaitersCount());
for (int priority = 0; priority <= maxPriority; ++priority) {
*s +=
format("%-15s %-8u ", (await + 'P' + std::to_string(priority)).c_str(), ioLock->numWaiters(priority));
*s += format("IOWaitingP%-5d %-8u ", priority, ioLock->getWaitersCount(priority));
}
}
}
@ -11407,57 +11401,3 @@ TEST_CASE(":/redwood/performance/histograms") {
return Void();
}
ACTOR Future<Void> waitLockIncrement(PriorityMultiLock* pml, int priority, int* pout) {
state PriorityMultiLock::Lock lock = wait(pml->lock(priority));
wait(delay(deterministicRandom()->random01() * .1));
++*pout;
return Void();
}
TEST_CASE("/redwood/PriorityMultiLock") {
state std::vector<int> priorities = { 10, 20, 40 };
state int concurrency = 25;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
// Clog the lock by taking concurrency locks at each level
state std::vector<Future<PriorityMultiLock::Lock>> lockFutures;
for (int i = 0; i < priorities.size(); ++i) {
for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(i));
}
}
// Wait for n = concurrency locks to be acquired
wait(quorum(lockFutures, concurrency));
state std::vector<Future<Void>> futures;
for (int i = 0; i < 10e3; ++i) {
int p = i % priorities.size();
futures.push_back(waitLockIncrement(pml, p, &counts[p]));
}
state Future<Void> f = waitForAll(futures);
// Release the locks
lockFutures.clear();
// Print stats and wait for all futures to be ready
loop {
choose {
when(wait(delay(1))) {
printf("counts: ");
for (auto c : counts) {
printf("%d ", c);
}
printf(" pml: %s\n", pml->toString().c_str());
}
when(wait(f)) { break; }
}
}
delete pml;
return Void();
}

View File

@ -162,10 +162,7 @@ ACTOR Future<Void> loadManifest(Database db, Reference<BlobConnectionProvider> b
ACTOR Future<Void> printRestoreSummary(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<BlobGranuleRestoreVersionVector> listBlobGranules(Database db, Reference<BlobConnectionProvider> blobConn);
ACTOR Future<int64_t> lastBlobEpoc(Database db, Reference<BlobConnectionProvider> blobConn);
inline bool isFullRestoreMode() {
return SERVER_KNOBS->BLOB_FULL_RESTORE_MODE;
};
ACTOR Future<bool> isFullRestoreMode(Database db, KeyRangeRef range);
#include "flow/unactorcompiler.h"

View File

@ -30,6 +30,7 @@
struct BlobMigratorInterface {
constexpr static FileIdentifier file_identifier = 869199;
RequestStream<struct HaltBlobMigratorRequest> haltBlobMigrator;
RequestStream<ReplyPromise<Void>> waitFailure;
LocalityData locality;
UID uniqueID;
StorageServerInterface ssi;
@ -48,7 +49,7 @@ struct BlobMigratorInterface {
template <class Archive>
void serialize(Archive& ar) {
serializer(ar, locality, uniqueID, haltBlobMigrator);
serializer(ar, locality, uniqueID, haltBlobMigrator, waitFailure);
}
};

View File

@ -144,6 +144,7 @@ public:
Future<Void> clientCounter;
int clientCount;
AsyncVar<bool> blobGranulesEnabled;
AsyncVar<bool> blobRestoreEnabled;
ClusterType clusterType = ClusterType::STANDALONE;
Optional<ClusterName> metaclusterName;
Optional<MetaclusterRegistrationEntry> metaclusterRegistration;
@ -159,7 +160,7 @@ public:
TaskPriority::DefaultEndpoint,
LockAware::True)), // SOMEDAY: Locality!
unfinishedRecoveries(0), logGenerations(0), cachePopulated(false), clientCount(0),
blobGranulesEnabled(config.blobGranulesEnabled) {
blobGranulesEnabled(config.blobGranulesEnabled), blobRestoreEnabled(false) {
clientCounter = countClients(this);
}

View File

@ -60,6 +60,7 @@ class GrvProxyTagThrottler {
void setRate(double rate);
bool isMaxThrottled(double maxThrottleDuration) const;
void rejectRequests(LatencyBandsMap&);
void endReleaseWindow(int64_t numStarted, double elapsed);
};
// Track the budgets for each tag

View File

@ -55,7 +55,7 @@ public:
// Updates the budget to accumulate any extra capacity available or remove any excess that was used.
// Call at the end of a release window.
void endReleaseWindow(int64_t numStartedAtPriority, bool queueEmptyAtPriority, double elapsed);
void endReleaseWindow(int64_t numStarted, bool queueEmpty, double elapsed);
// Smoothly sets rate. If currently disabled, reenable
void setRate(double rate);

View File

@ -208,7 +208,7 @@ class Ratekeeper {
Deque<std::pair<double, Version>> blobWorkerVersionHistory;
Optional<Key> remoteDC;
double getRecoveryDuration(Version ver) {
double getRecoveryDuration(Version ver) const {
auto it = version_recovery.lower_bound(ver);
double recoveryDuration = 0;
while (it != version_recovery.end()) {

View File

@ -192,7 +192,7 @@ Future<Void> serveStorageMetricsRequests(ServiceType* self, StorageServerInterfa
choose {
when(state WaitMetricsRequest req = waitNext(ssi.waitMetrics.getFuture())) {
if (!req.tenantInfo.present() && !self->isReadable(req.keys)) {
CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()");
CODE_PROBE(true, "waitMetrics immediate wrong_shard_server()", probe::decoration::rare);
self->sendErrorWithPenalty(req.reply, wrong_shard_server(), self->getPenalty());
} else {
self->addActor(self->waitMetricsTenantAware(req));
@ -233,4 +233,4 @@ Future<Void> serveStorageMetricsRequests(ServiceType* self, StorageServerInterfa
}
}
#include "flow/unactorcompiler.h"
#endif // FDBSERVER_STORAGEMETRICS_H
#endif // FDBSERVER_STORAGEMETRICS_H

View File

@ -35,8 +35,9 @@ typedef Map<KeyRef, Reference<TCTenantInfo>> TenantMapByPrefix;
struct Storage {
int64_t quota = std::numeric_limits<int64_t>::max();
int64_t usage = 0;
std::unordered_set<TenantName> tenants;
};
typedef std::unordered_map<TenantName, Storage> TenantStorageMap;
typedef std::unordered_map<TenantGroupName, Storage> TenantStorageMap;
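A small usage sketch of the per-group bookkeeping, assuming the Storage struct above (group and tenant names illustrative):

TenantStorageMap storageMap;
TenantGroupName group = "groupA"_sr;
storageMap[group].tenants.insert("tenant1"_sr); // group membership
storageMap[group].usage += 1 << 20;             // attribute 1 MiB of usage to the group
// quota defaults to int64 max, so the group is over quota only once a quota is set:
bool overQuota = storageMap[group].usage > storageMap[group].quota;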
struct TenantCacheTenantCreated {
KeyRange keys;
@ -56,7 +57,8 @@ private:
uint64_t generation;
TenantMapByPrefix tenantCache;
// Map from tenant names to storage quota and usage
// Map from tenant group name to the set of tenants in the group, the cumulative storage
// used by all tenants in the group, and the group's storage quota.
TenantStorageMap tenantStorageMap;
// mark the start of a new sweep of the tenant cache

View File

@ -566,6 +566,8 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
// back, we can avoid notifying other SS of change feeds that don't durably exist
Version metadataCreateVersion = invalidVersion;
FlowLock fetchLock = FlowLock(1);
bool removing = false;
bool destroyed = false;
@ -1109,15 +1111,13 @@ public:
FlowLock serveFetchCheckpointParallelismLock;
PriorityMultiLock ssLock;
Reference<PriorityMultiLock> ssLock;
std::vector<int> readPriorityRanks;
Future<PriorityMultiLock::Lock> getReadLock(const Optional<ReadOptions>& options) {
// TODO: Fix perf regression in 100% cache read case where taking this lock adds too much overhead
return PriorityMultiLock::Lock();
// int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL);
// readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
// return ssLock.lock(readPriorityRanks[readType]);
int readType = (int)(options.present() ? options.get().type : ReadType::NORMAL);
readType = std::clamp<int>(readType, 0, readPriorityRanks.size() - 1);
return ssLock->lock(readPriorityRanks[readType]);
}
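// Illustration only (assumed knob string): with STORAGESERVER_READTYPE_PRIORITY_MAP = "0,1,1,2,3",
// readPriorityRanks[(int)ReadType::NORMAL] yields the lock priority used for a normal read,
// and the clamp pins any read type beyond the parsed list to the last configured rank.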
FlowLock serveAuditStorageParallelismLock;
@ -1406,7 +1406,8 @@ public:
fetchKeysParallelismFullLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_FULL),
fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false),
serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM),
ssLock(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY, SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES),
ssLock(makeReference<PriorityMultiLock>(SERVER_KNOBS->STORAGE_SERVER_READ_CONCURRENCY,
SERVER_KNOBS->STORAGESERVER_READ_PRIORITIES)),
serveAuditStorageParallelismLock(SERVER_KNOBS->SERVE_AUDIT_STORAGE_PARALLELISM),
instanceID(deterministicRandom()->randomUniqueID().first()), shuttingDown(false), behind(false),
versionBehind(false), debug_inApplyUpdate(false), debug_lastValidateTime(0), lastBytesInputEBrake(0),
@ -1414,7 +1415,7 @@ public:
busiestWriteTagContext(ssi.id()), counters(this),
storageServerSourceTLogIDEventHolder(
makeReference<EventCacheHolder>(ssi.id().toString() + "/StorageServerSourceTLogID")) {
readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READ_RANKS, ',');
readPriorityRanks = parseStringToVector<int>(SERVER_KNOBS->STORAGESERVER_READTYPE_PRIORITY_MAP, ',');
ASSERT(readPriorityRanks.size() > (int)ReadType::MAX);
version.initMetric("StorageServer.Version"_sr, counters.cc.getId());
oldestVersion.initMetric("StorageServer.OldestVersion"_sr, counters.cc.getId());
@ -4765,7 +4766,6 @@ ACTOR Future<Void> mapSubquery(StorageServer* data,
Arena* pArena,
int matchIndex,
bool isRangeQuery,
bool isBoundary,
KeyValueRef* it,
MappedKeyValueRef* kvm,
Key mappedKey) {
@ -4773,31 +4773,42 @@ ACTOR Future<Void> mapSubquery(StorageServer* data,
// Use the mappedKey as the prefix of the range query.
GetRangeReqAndResultRef getRange = wait(quickGetKeyValues(data, mappedKey, version, pArena, pOriginalReq));
if ((!getRange.result.empty() && matchIndex == MATCH_INDEX_MATCHED_ONLY) ||
(getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY)) {
(getRange.result.empty() && matchIndex == MATCH_INDEX_UNMATCHED_ONLY) || matchIndex == MATCH_INDEX_ALL) {
kvm->key = it->key;
kvm->value = it->value;
}
kvm->boundaryAndExist = isBoundary && !getRange.result.empty();
kvm->reqAndResult = getRange;
} else {
GetValueReqAndResultRef getValue = wait(quickGetValue(data, mappedKey, version, pArena, pOriginalReq));
kvm->reqAndResult = getValue;
kvm->boundaryAndExist = isBoundary && getValue.result.present();
}
return Void();
}
int getMappedKeyValueSize(MappedKeyValueRef mappedKeyValue) {
auto& reqAndResult = mappedKeyValue.reqAndResult;
int bytes = 0;
if (std::holds_alternative<GetValueReqAndResultRef>(reqAndResult)) {
const auto& getValue = std::get<GetValueReqAndResultRef>(reqAndResult);
bytes = getValue.expectedSize();
} else if (std::holds_alternative<GetRangeReqAndResultRef>(reqAndResult)) {
const auto& getRange = std::get<GetRangeReqAndResultRef>(reqAndResult);
bytes = getRange.result.expectedSize();
} else {
throw internal_error();
}
return bytes;
}
ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
GetKeyValuesReply input,
StringRef mapper,
// To provide span context, tags, debug ID to underlying lookups.
GetMappedKeyValuesRequest* pOriginalReq,
Optional<Key> tenantPrefix,
int matchIndex) {
int matchIndex,
int* remainingLimitBytes) {
state GetMappedKeyValuesReply result;
result.version = input.version;
result.more = input.more;
result.cached = input.cached;
result.arena.dependsOn(input.arena);
@ -4826,22 +4837,15 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
g_traceBatch.addEvent("TransactionDebug",
pOriginalReq->options.get().debugID.get().first(),
"storageserver.mapKeyValues.BeforeLoop");
for (; offset < sz; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) {
for (; offset < sz && *remainingLimitBytes > 0; offset += SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE) {
// Divide into batches of MAX_PARALLEL_QUICK_GET_VALUE subqueries
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
KeyValueRef* it = &input.data[i + offset];
MappedKeyValueRef* kvm = &kvms[i];
bool isBoundary = (i + offset) == 0 || (i + offset) == sz - 1;
// need to keep the boundary, so that caller can use it as a continuation.
if (isBoundary || matchIndex == MATCH_INDEX_ALL) {
kvm->key = it->key;
kvm->value = it->value;
} else {
// Clear key value to the default.
kvm->key = ""_sr;
kvm->value = ""_sr;
}
// Clear key value to the default.
kvm->key = ""_sr;
kvm->value = ""_sr;
Key mappedKey = constructMappedKey(it, vt, mappedKeyFormatTuple);
// Make sure the mappedKey is always available, so that it stays valid even when the key is fetched asynchronously.
result.arena.dependsOn(mappedKey.arena());
@ -4849,16 +4853,8 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
// std::cout << "key:" << printable(kvm->key) << ", value:" << printable(kvm->value)
// << ", mappedKey:" << printable(mappedKey) << std::endl;
subqueries.push_back(mapSubquery(data,
input.version,
pOriginalReq,
&result.arena,
matchIndex,
isRangeQuery,
isBoundary,
it,
kvm,
mappedKey));
subqueries.push_back(mapSubquery(
data, input.version, pOriginalReq, &result.arena, matchIndex, isRangeQuery, it, kvm, mappedKey));
}
wait(waitForAll(subqueries));
if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present())
@ -4867,9 +4863,31 @@ ACTOR Future<GetMappedKeyValuesReply> mapKeyValues(StorageServer* data,
"storageserver.mapKeyValues.AfterBatch");
subqueries.clear();
for (int i = 0; i + offset < sz && i < SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE; i++) {
// since we always read the index, always account for the index size
int indexSize = sizeof(KeyValueRef) + input.data[i + offset].expectedSize();
int size = indexSize + getMappedKeyValueSize(kvms[i]);
*remainingLimitBytes -= size;
result.data.push_back(result.arena, kvms[i]);
if (SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT && *remainingLimitBytes <= 0) {
break;
}
}
}
int resultSize = result.data.size();
if (resultSize > 0) {
// keep the index for boundary entries, so that the caller can use it as a continuation.
result.data[0].key = input.data[0].key;
result.data[0].value = input.data[0].value;
result.data[0].boundaryAndExist = getMappedKeyValueSize(kvms[0]) > 0;
result.data.back().key = input.data[resultSize - 1].key;
result.data.back().value = input.data[resultSize - 1].value;
// kvms holds only the most recent batch, so map the last result back to its
// index within that final batch
int index = (resultSize - 1) % SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
result.data.back().boundaryAndExist = getMappedKeyValueSize(kvms[index]) > 0;
}
result.more = input.more || resultSize < sz;
if (pOriginalReq->options.present() && pOriginalReq->options.get().debugID.present())
g_traceBatch.addEvent("TransactionDebug",
pOriginalReq->options.get().debugID.get().first(),
@ -5124,12 +5142,15 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
req.reply.send(none);
} else {
state int remainingLimitBytes = req.limitBytes;
// create a temporary byte limit for index fetching ONLY; this can be generous,
// because readRange is cheap when reading additional bytes
state int bytesForIndex =
std::min(req.limitBytes, (int)(req.limitBytes * SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH));
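// Illustration only (assumed values): req.limitBytes = 80000 and
// FRACTION_INDEX_BYTELIMIT_PREFETCH = 0.2 give bytesForIndex = min(80000, 16000) = 16000,
// so the index scan may use at most a fifth of the reply's byte budget while the mapped
// lookups are charged against remainingLimitBytes.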
GetKeyValuesReply getKeyValuesReply = wait(readRange(data,
version,
KeyRangeRef(begin, end),
req.limit,
&remainingLimitBytes,
&bytesForIndex,
span.context,
req.options,
tenantPrefix));
@ -5143,9 +5164,10 @@ ACTOR Future<Void> getMappedKeyValuesQ(StorageServer* data, GetMappedKeyValuesRe
try {
// Map the scanned range to another list of keys and look up.
GetMappedKeyValuesReply _r =
wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, tenantPrefix, req.matchIndex));
wait(mapKeyValues(data, getKeyValuesReply, req.mapper, &req, req.matchIndex, &remainingLimitBytes));
r = _r;
} catch (Error& e) {
// catch txn_too_old here if the prefetch runs for too long, and return it to the client
TraceEvent("MapError").error(e);
throw;
}
@ -6142,6 +6164,7 @@ ACTOR Future<Standalone<VectorRef<BlobGranuleChunkRef>>> tryReadBlobGranules(Tra
loop {
try {
Standalone<VectorRef<BlobGranuleChunkRef>> chunks = wait(tr->readBlobGranules(keys, 0, readVersion));
TraceEvent(SevDebug, "ReadBlobGranules").detail("Keys", keys).detail("Chunks", chunks.size());
return chunks;
} catch (Error& e) {
if (retryCount >= maxRetryCount) {
@ -6173,10 +6196,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
for (i = 0; i < chunks.size(); ++i) {
state KeyRangeRef chunkRange = chunks[i].keyRange;
state RangeResult rows = wait(readBlobGranule(chunks[i], keys, 0, fetchVersion, blobConn));
TraceEvent("ReadBlobData")
.detail("Rows", rows.size())
.detail("ChunkRange", chunkRange.toString())
.detail("Keys", keys.toString());
TraceEvent(SevDebug, "ReadBlobData").detail("Rows", rows.size()).detail("ChunkRange", chunkRange);
if (rows.size() == 0) {
rows.readThrough = KeyRef(rows.arena(), std::min(chunkRange.end, keys.end));
}
@ -6189,7 +6209,7 @@ ACTOR Future<Void> tryGetRangeFromBlob(PromiseStream<RangeResult> results,
} catch (Error& e) {
TraceEvent(SevWarn, "ReadBlobDataFailure")
.suppressFor(5.0)
.detail("Keys", keys.toString())
.detail("Keys", keys)
.detail("FetchVersion", fetchVersion)
.detail("Error", e.what());
tr->reset();
@ -6290,6 +6310,15 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
Version beginVersion,
Version endVersion,
ReadOptions readOptions) {
state FlowLock::Releaser feedFetchReleaser;
// avoid fetching the same version range of the same change feed multiple times.
choose {
when(wait(changeFeedInfo->fetchLock.take())) {
feedFetchReleaser = FlowLock::Releaser(changeFeedInfo->fetchLock);
}
when(wait(changeFeedInfo->durableFetchVersion.whenAtLeast(endVersion))) { return invalidVersion; }
}
state Version startVersion = beginVersion;
startVersion = std::max(startVersion, emptyVersion + 1);
@ -6309,6 +6338,7 @@ ACTOR Future<Version> fetchChangeFeedApplier(StorageServer* data,
return invalidVersion;
}
// FIXME: if this feed range is not wholly contained within the shard, set cache to true on reading
state Reference<ChangeFeedData> feedResults = makeReference<ChangeFeedData>();
state Future<Void> feed = data->cx->getChangeFeedStream(feedResults,
rangeId,
@ -6824,6 +6854,16 @@ ACTOR Future<std::vector<Key>> fetchChangeFeedMetadata(StorageServer* data,
return feedIds;
}
ReadOptions readOptionsForFeedFetch(const ReadOptions& options, const KeyRangeRef& keys, const KeyRangeRef& feedRange) {
if (!feedRange.contains(keys)) {
return options;
}
// If feed range wholly contains shard range, cache on fetch because other shards will likely also fetch it
ReadOptions newOptions = options;
newOptions.cacheResult = true;
return newOptions;
}
// returns max version fetched for each feed
// newFeedIds is used for the second fetch to get data for new feeds that weren't there for the first fetch
ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer* data,
@ -6848,8 +6888,9 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
auto feedIt = data->uidChangeFeed.find(feedId);
// feed may have been moved away or deleted after move was scheduled, do nothing in that case
if (feedIt != data->uidChangeFeed.end() && !feedIt->second->removing) {
ReadOptions fetchReadOptions = readOptionsForFeedFetch(readOptions, keys, feedIt->second->range);
feedFetches[feedIt->second->id] =
fetchChangeFeed(data, feedIt->second, beginVersion, endVersion, readOptions);
fetchChangeFeed(data, feedIt->second, beginVersion, endVersion, fetchReadOptions);
}
}
for (auto& feedId : newFeedIds) {
@ -6857,7 +6898,8 @@ ACTOR Future<std::unordered_map<Key, Version>> dispatchChangeFeeds(StorageServer
// we just read the change feed data map earlier in fetchKeys without yielding, so these feeds must exist
ASSERT(feedIt != data->uidChangeFeed.end());
ASSERT(!feedIt->second->removing);
feedFetches[feedIt->second->id] = fetchChangeFeed(data, feedIt->second, 0, endVersion, readOptions);
ReadOptions fetchReadOptions = readOptionsForFeedFetch(readOptions, keys, feedIt->second->range);
feedFetches[feedIt->second->id] = fetchChangeFeed(data, feedIt->second, 0, endVersion, fetchReadOptions);
}
loop {
@ -6998,7 +7040,8 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
// We must also ensure we have fetched all change feed metadata BEFORE changing the phase to fetching to ensure
// change feed mutations get applied correctly
state std::vector<Key> changeFeedsToFetch;
if (!isFullRestoreMode()) {
state bool isFullRestore = wait(isFullRestoreMode(data->cx, keys));
if (!isFullRestore) {
std::vector<Key> _cfToFetch = wait(fetchCFMetadata);
changeFeedsToFetch = _cfToFetch;
}
@ -7076,7 +7119,7 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
state PromiseStream<RangeResult> results;
state Future<Void> hold;
if (SERVER_KNOBS->FETCH_USING_BLOB) {
if (isFullRestore) {
hold = tryGetRangeFromBlob(results, &tr, keys, fetchVersion, data->blobConn);
} else {
hold = tryGetRange(results, &tr, keys);
@ -7114,7 +7157,6 @@ ACTOR Future<Void> fetchKeys(StorageServer* data, AddingShard* shard) {
data->thisServerID);
}
}
metricReporter.addFetchedBytes(expectedBlockSize, this_block.size());
// Write this_block to storage
@ -10414,20 +10456,20 @@ ACTOR Future<Void> metricsCore(StorageServer* self, StorageServerInterface ssi)
te.detail("StorageEngine", self->storage.getKeyValueStoreType().toString());
te.detail("Tag", self->tag.toString());
std::vector<int> rpr = self->readPriorityRanks;
te.detail("ReadsActive", self->ssLock.totalRunners());
te.detail("ReadsWaiting", self->ssLock.totalWaiters());
te.detail("ReadsTotalActive", self->ssLock->getRunnersCount());
te.detail("ReadsTotalWaiting", self->ssLock->getWaitersCount());
int type = (int)ReadType::FETCH;
te.detail("ReadFetchActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadFetchWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadFetchActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadFetchWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::LOW;
te.detail("ReadLowActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadLowWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadLowActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadLowWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::NORMAL;
te.detail("ReadNormalActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadNormalWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadNormalActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadNormalWaiting", self->ssLock->getWaitersCount(rpr[type]));
type = (int)ReadType::HIGH;
te.detail("ReadHighActive", self->ssLock.numRunners(rpr[type]));
te.detail("ReadHighWaiting", self->ssLock.numWaiters(rpr[type]));
te.detail("ReadHighActive", self->ssLock->getRunnersCount(rpr[type]));
te.detail("ReadHighWaiting", self->ssLock->getWaitersCount(rpr[type]));
StorageBytes sb = self->storage.getStorageBytes();
te.detail("KvstoreBytesUsed", sb.used);
te.detail("KvstoreBytesFree", sb.free);
@ -11243,7 +11285,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
// If the storage server dies while something that uses self is still on the stack,
// we want that actor to complete before we terminate and that memory goes out of scope
self.ssLock.kill();
self.ssLock->kill();
state Error err = e;
if (storageServerTerminated(self, persistentData, err)) {
@ -11341,7 +11383,7 @@ ACTOR Future<Void> storageServer(IKeyValueStore* persistentData,
throw internal_error();
} catch (Error& e) {
self.ssLock.kill();
self.ssLock->kill();
if (self.byteSampleRecovery.isValid()) {
self.byteSampleRecovery.cancel();

View File

@ -2331,10 +2331,11 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
recruited.initEndpoints();
if (blobMigratorInterf->get().present()) {
recruited = blobMigratorInterf->get().get();
CODE_PROBE(true, "Recruited while already a blob migrator.");
CODE_PROBE(true, "Recruited while already a blob migrator.", probe::decoration::rare);
} else {
startRole(Role::BLOB_MIGRATOR, recruited.id(), interf.id());
DUMPTOKEN(recruited.haltBlobMigrator);
DUMPTOKEN(recruited.waitFailure);
DUMPTOKEN(recruited.ssi.getValue);
DUMPTOKEN(recruited.ssi.getKey);
DUMPTOKEN(recruited.ssi.getKeyValues);
@ -2345,7 +2346,6 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
DUMPTOKEN(recruited.ssi.getReadHotRanges);
DUMPTOKEN(recruited.ssi.getRangeSplitPoints);
DUMPTOKEN(recruited.ssi.getStorageMetrics);
DUMPTOKEN(recruited.ssi.waitFailure);
DUMPTOKEN(recruited.ssi.getQueuingMetrics);
DUMPTOKEN(recruited.ssi.getKeyValueStoreType);
DUMPTOKEN(recruited.ssi.watchValue);
@ -2796,7 +2796,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
when(state WorkerSnapRequest snapReq = waitNext(interf.workerSnapReq.getFuture())) {
std::string snapReqKey = snapReq.snapUID.toString() + snapReq.role.toString();
if (snapReqResultMap.count(snapReqKey)) {
CODE_PROBE(true, "Worker received a duplicate finished snapshot request");
CODE_PROBE(true, "Worker received a duplicate finished snapshot request", probe::decoration::rare);
auto result = snapReqResultMap[snapReqKey];
result.isError() ? snapReq.reply.sendError(result.getError()) : snapReq.reply.send(result.get());
TraceEvent("RetryFinishedWorkerSnapRequest")
@ -2804,7 +2804,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
.detail("Role", snapReq.role)
.detail("Result", result.isError() ? result.getError().code() : success().code());
} else if (snapReqMap.count(snapReqKey)) {
CODE_PROBE(true, "Worker received a duplicate ongoing snapshot request");
CODE_PROBE(true, "Worker received a duplicate ongoing snapshot request", probe::decoration::rare);
TraceEvent("RetryOngoingWorkerSnapRequest")
.detail("SnapUID", snapReq.snapUID.toString())
.detail("Role", snapReq.role);

View File

@ -20,7 +20,9 @@
#include <cstdint>
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
@ -28,9 +30,13 @@
struct CreateTenantWorkload : TestWorkload {
static constexpr auto NAME = "CreateTenant";
TenantName tenant;
Optional<TenantGroupName> tenantGroup;
CreateTenantWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
tenant = getOption(options, "name"_sr, "DefaultTenant"_sr);
if (hasOption(options, "group"_sr)) {
tenantGroup = getOption(options, "group"_sr, "DefaultGroup"_sr);
}
}
Future<Void> setup(Database const& cx) override {
@ -46,7 +52,12 @@ struct CreateTenantWorkload : TestWorkload {
ACTOR static Future<Void> _setup(CreateTenantWorkload* self, Database db) {
try {
Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(db.getReference(), self->tenant));
TenantMapEntry givenEntry;
if (self->tenantGroup.present()) {
givenEntry.tenantGroup = self->tenantGroup.get();
givenEntry.encrypted = SERVER_KNOBS->ENABLE_ENCRYPTION;
}
Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(db.getReference(), self->tenant, givenEntry));
ASSERT(entry.present());
} catch (Error& e) {
TraceEvent(SevError, "TenantCreationFailed").error(e);

View File

@ -38,6 +38,8 @@ const KeyRef prefix = "prefix"_sr;
const KeyRef RECORD = "RECORD"_sr;
const KeyRef INDEX = "INDEX"_sr;
int recordSize;
int indexSize;
struct GetMappedRangeWorkload : ApiWorkload {
static constexpr auto NAME = "GetMappedRange";
bool enabled;
@ -93,19 +95,32 @@ struct GetMappedRangeWorkload : ApiWorkload {
loop {
std::cout << "start fillInRecords n=" << n << std::endl;
// TODO: When n is large, split into multiple transactions.
recordSize = 0;
indexSize = 0;
try {
for (int i = 0; i < n; i++) {
if (self->SPLIT_RECORDS) {
for (int split = 0; split < SPLIT_SIZE; split++) {
tr.set(recordKey(i, split), recordValue(i, split));
if (i == 0) {
recordSize +=
recordKey(i, split).size() + recordValue(i, split).size() + sizeof(KeyValueRef);
}
}
} else {
tr.set(recordKey(i), recordValue(i));
if (i == 0) {
recordSize += recordKey(i).size() + recordValue(i).size() + sizeof(KeyValueRef);
}
}
tr.set(indexEntryKey(i), EMPTY);
if (i == 0) {
indexSize += indexEntryKey(i).size() + sizeof(KeyValueRef);
}
}
wait(tr.commit());
std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << std::endl;
std::cout << "finished fillInRecords with version " << tr.getCommittedVersion() << " recordSize "
<< recordSize << " indexSize " << indexSize << std::endl;
break;
} catch (Error& e) {
std::cout << "failed fillInRecords, retry" << std::endl;
@ -146,8 +161,9 @@ struct GetMappedRangeWorkload : ApiWorkload {
int matchIndex,
bool isBoundary,
bool allMissing) {
// std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key) << "
// indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId)) << std::endl;
// std::cout << "validateRecord expectedId " << expectedId << " it->key " << printable(it->key)
// << " indexEntryKey(expectedId) " << printable(indexEntryKey(expectedId))
// << " matchIndex: " << matchIndex << std::endl;
if (matchIndex == MATCH_INDEX_ALL || isBoundary) {
ASSERT(it->key == indexEntryKey(expectedId));
} else if (matchIndex == MATCH_INDEX_MATCHED_ONLY) {
@ -163,7 +179,6 @@ struct GetMappedRangeWorkload : ApiWorkload {
ASSERT(std::holds_alternative<GetRangeReqAndResultRef>(it->reqAndResult));
auto& getRange = std::get<GetRangeReqAndResultRef>(it->reqAndResult);
auto& rangeResult = getRange.result;
ASSERT(it->boundaryAndExist == (isBoundary && !rangeResult.empty()));
// std::cout << "rangeResult.size()=" << rangeResult.size() << std::endl;
// In the future, we may be able to do the continuation more efficiently by combining partial results
// together and then validate.
@ -200,6 +215,7 @@ struct GetMappedRangeWorkload : ApiWorkload {
KeySelector endSelector,
Key mapper,
int limit,
int byteLimit,
int expectedBeginId,
GetMappedRangeWorkload* self,
int matchIndex,
@ -207,14 +223,16 @@ struct GetMappedRangeWorkload : ApiWorkload {
std::cout << "start scanMappedRangeWithLimits beginSelector:" << beginSelector.toString()
<< " endSelector:" << endSelector.toString() << " expectedBeginId:" << expectedBeginId
<< " limit:" << limit << std::endl;
<< " limit:" << limit << " byteLimit: " << byteLimit << " recordSize: " << recordSize
<< " STRICTLY_ENFORCE_BYTE_LIMIT: " << SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT << " allMissing "
<< allMissing << std::endl;
loop {
state Reference<TransactionWrapper> tr = self->createTransaction();
try {
MappedRangeResult result = wait(tr->getMappedRange(beginSelector,
endSelector,
mapper,
GetRangeLimits(limit),
GetRangeLimits(limit, byteLimit),
matchIndex,
self->snapshot,
Reverse::False));
@ -270,17 +288,51 @@ struct GetMappedRangeWorkload : ApiWorkload {
Key endTuple = Tuple::makeTuple(prefix, INDEX, indexKey(endId)).getDataAsStandalone();
state KeySelector endSelector = KeySelector(firstGreaterOrEqual(endTuple));
state int limit = 100;
state int byteLimit = deterministicRandom()->randomInt(1, 9) * 10000;
state int expectedBeginId = beginId;
std::cout << "ByteLimit: " << byteLimit << " limit: " << limit
<< " FRACTION_INDEX_BYTELIMIT_PREFETCH: " << SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH
<< " MAX_PARALLEL_QUICK_GET_VALUE: " << SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE << std::endl;
while (true) {
MappedRangeResult result = wait(self->scanMappedRangeWithLimits(
cx, beginSelector, endSelector, mapper, limit, expectedBeginId, self, matchIndex, allMissing));
MappedRangeResult result = wait(self->scanMappedRangeWithLimits(cx,
beginSelector,
endSelector,
mapper,
limit,
byteLimit,
expectedBeginId,
self,
matchIndex,
allMissing));
expectedBeginId += result.size();
if (result.more) {
if (result.empty()) {
// This is usually not expected.
std::cout << "not result but have more, try again" << std::endl;
} else {
// auto& reqAndResult = std::get<GetRangeReqAndResultRef>(result.back().reqAndResult);
int size = allMissing ? indexSize : (indexSize + recordSize);
int expectedCnt = limit;
int indexByteLimit = byteLimit * SERVER_KNOBS->FRACTION_INDEX_BYTELIMIT_PREFETCH;
int indexCountByteLimit = indexByteLimit / indexSize + (indexByteLimit % indexSize != 0);
int indexCount = std::min(limit, indexCountByteLimit);
std::cout << "indexCount: " << indexCount << std::endl;
// result set cannot be larger than the number of index entries fetched
ASSERT(result.size() <= indexCount);
expectedCnt = std::min(expectedCnt, indexCount);
int boundByRecord;
if (SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT) {
// might have 1 additional entry over the limit
boundByRecord = byteLimit / size + (byteLimit % size != 0);
} else {
// might have 1 additional batch over the limit
int roundSize = size * SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
int round = byteLimit / roundSize + (byteLimit % roundSize != 0);
boundByRecord = round * SERVER_KNOBS->MAX_PARALLEL_QUICK_GET_VALUE;
}
expectedCnt = std::min(expectedCnt, boundByRecord);
std::cout << "boundByRecord: " << boundByRecord << std::endl;
ASSERT(result.size() == expectedCnt);
beginSelector = KeySelector(firstGreaterThan(result.back().key));
}
} else {
@ -289,6 +341,7 @@ struct GetMappedRangeWorkload : ApiWorkload {
}
}
ASSERT(expectedBeginId == endId);
return Void();
}
@ -433,6 +486,8 @@ struct GetMappedRangeWorkload : ApiWorkload {
} else if (r < 0.75) {
matchIndex = MATCH_INDEX_UNMATCHED_ONLY;
}
state bool originalStrictlyEnforceByteLimit = SERVER_KNOBS->STRICTLY_ENFORCE_BYTE_LIMIT;
(const_cast<ServerKnobs*>(SERVER_KNOBS))->STRICTLY_ENFORCE_BYTE_LIMIT = deterministicRandom()->coinflip();
wait(self->scanMappedRange(cx, 10, 490, mapper, self, matchIndex));
{
@ -440,6 +495,8 @@ struct GetMappedRangeWorkload : ApiWorkload {
wait(self->scanMappedRange(cx, 10, 490, mapper, self, MATCH_INDEX_UNMATCHED_ONLY, true));
}
// reset it to default
(const_cast<ServerKnobs*>(SERVER_KNOBS))->STRICTLY_ENFORCE_BYTE_LIMIT = originalStrictlyEnforceByteLimit;
return Void();
}
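
The result-size assertion above combines three bounds: the row limit, an index-prefetch bound derived from FRACTION_INDEX_BYTELIMIT_PREFETCH, and a record-byte bound whose rounding depends on STRICTLY_ENFORCE_BYTE_LIMIT. Below is a standalone sketch of that arithmetic; the knob values and record/index sizes are illustrative assumptions, not the defaults.

#include <algorithm>
#include <cstdio>

int main() {
    // Assumed inputs; in the workload these come from knobs and fillInRecords().
    const int limit = 100, byteLimit = 40000;
    const int indexSize = 38, recordSize = 200;
    const double fractionIndexByteLimitPrefetch = 2.0;
    const int maxParallelQuickGetValue = 8;
    const bool strictlyEnforceByteLimit = false;

    const int size = indexSize + recordSize;
    int expectedCnt = limit;

    // Bound 1: index entries fetched under the prefetch byte limit (ceiling division).
    const int indexByteLimit = byteLimit * fractionIndexByteLimitPrefetch;
    const int indexCount = std::min(limit, indexByteLimit / indexSize + (indexByteLimit % indexSize != 0));
    expectedCnt = std::min(expectedCnt, indexCount);

    // Bound 2: records under the byte limit, rounded per entry or per parallel batch.
    int boundByRecord;
    if (strictlyEnforceByteLimit) {
        boundByRecord = byteLimit / size + (byteLimit % size != 0); // may overshoot by one entry
    } else {
        const int roundSize = size * maxParallelQuickGetValue;
        const int round = byteLimit / roundSize + (byteLimit % roundSize != 0);
        boundByRecord = round * maxParallelQuickGetValue; // may overshoot by one batch
    }
    expectedCnt = std::min(expectedCnt, boundByRecord);
    std::printf("expected rows per batch: %d\n", expectedCnt);
    return 0;
}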

View File

@ -292,7 +292,7 @@ struct IDDTxnProcessorApiWorkload : TestWorkload {
self->verifyServerKeyDest(params);
// test finish or started but cancelled movement
if (self->testStartOnly || deterministicRandom()->coinflip()) {
CODE_PROBE(true, "RawMovementApi partial started");
CODE_PROBE(true, "RawMovementApi partial started", probe::decoration::rare);
break;
}
@ -412,4 +412,4 @@ struct IDDTxnProcessorApiWorkload : TestWorkload {
void getMetrics(std::vector<PerfMetric>& m) override {}
};
WorkloadFactory<IDDTxnProcessorApiWorkload> IDDTxnProcessorApiWorkload;
WorkloadFactory<IDDTxnProcessorApiWorkload> IDDTxnProcessorApiWorkload;

View File

@ -18,9 +18,10 @@
* limitations under the License.
*/
#include "fdbrpc/TenantName.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/Tenant.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbrpc/TenantName.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
@ -31,12 +32,16 @@
struct StorageQuotaWorkload : TestWorkload {
static constexpr auto NAME = "StorageQuota";
TenantGroupName group;
TenantName tenant;
int nodeCount;
TenantName emptyTenant;
StorageQuotaWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
nodeCount = getOption(options, "nodeCount"_sr, 10000);
group = getOption(options, "group"_sr, "DefaultGroup"_sr);
tenant = getOption(options, "tenant"_sr, "DefaultTenant"_sr);
nodeCount = getOption(options, "nodeCount"_sr, 10000);
emptyTenant = getOption(options, "emptyTenant"_sr, "DefaultTenant"_sr);
}
Future<Void> setup(Database const& cx) override {
@ -67,27 +72,42 @@ struct StorageQuotaWorkload : TestWorkload {
Standalone<KeyValueRef> operator()(int n) { return KeyValueRef(keyForIndex(n), value((n + 1) % nodeCount)); }
ACTOR Future<Void> _start(StorageQuotaWorkload* self, Database cx) {
// Check that the quota set/get functions work as expected.
// Set the quota to just below the current size.
state TenantMapEntry entry1 = wait(TenantAPI::getTenant(cx.getReference(), self->tenant));
state TenantMapEntry entry2 = wait(TenantAPI::getTenant(cx.getReference(), self->emptyTenant));
ASSERT(entry1.tenantGroup.present() && entry1.tenantGroup.get() == self->group &&
entry2.tenantGroup.present() && entry2.tenantGroup.get() == self->group);
// Get the size of the non-empty tenant. We will set the quota of the tenant group
// to just below the current size of this tenant.
state int64_t size = wait(getSize(cx, self->tenant));
state int64_t quota = size - 1;
wait(setStorageQuotaHelper(cx, self->tenant, quota));
state Optional<int64_t> quotaRead = wait(getStorageQuotaHelper(cx, self->tenant));
// Check that the quota set/get functions work as expected.
wait(setStorageQuotaHelper(cx, self->group, quota));
state Optional<int64_t> quotaRead = wait(getStorageQuotaHelper(cx, self->group));
ASSERT(quotaRead.present() && quotaRead.get() == quota);
if (!SERVER_KNOBS->DD_TENANT_AWARENESS_ENABLED) {
if (!SERVER_KNOBS->STORAGE_QUOTA_ENABLED) {
return Void();
}
// Check that writes are rejected when the tenant is over quota.
state bool rejected = wait(tryWrite(self, cx, /*expectOk=*/false));
ASSERT(rejected);
// Check that writes to both tenants are rejected when the group is over quota.
state bool rejected1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/false));
ASSERT(rejected1);
state bool rejected2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/false));
ASSERT(rejected2);
// Increase the quota. Check that writes are now able to commit.
quota = size * 2;
wait(setStorageQuotaHelper(cx, self->tenant, quota));
state bool committed = wait(tryWrite(self, cx, /*expectOk=*/true));
ASSERT(committed);
// Increase the quota or clear the quota. Check that writes to both tenants are now able to commit.
if (deterministicRandom()->coinflip()) {
quota = size * 2;
wait(setStorageQuotaHelper(cx, self->group, quota));
} else {
wait(clearStorageQuotaHelper(cx, self->group));
}
state bool committed1 = wait(tryWrite(self, cx, self->tenant, /*expectOk=*/true));
ASSERT(committed1);
state bool committed2 = wait(tryWrite(self, cx, self->emptyTenant, /*expectOk=*/true));
ASSERT(committed2);
return Void();
}
@ -115,11 +135,11 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<Void> setStorageQuotaHelper(Database cx, TenantName tenantName, int64_t quota) {
ACTOR static Future<Void> setStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName, int64_t quota) {
state Transaction tr(cx);
loop {
try {
setStorageQuota(tr, tenantName, quota);
setStorageQuota(tr, tenantGroupName, quota);
wait(tr.commit());
return Void();
} catch (Error& e) {
@ -128,12 +148,24 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, TenantName tenantName) {
ACTOR static Future<Void> clearStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName) {
state Transaction tr(cx);
loop {
try {
state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantName));
clearStorageQuota(tr, tenantGroupName);
wait(tr.commit());
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR static Future<Optional<int64_t>> getStorageQuotaHelper(Database cx, TenantGroupName tenantGroupName) {
state Transaction tr(cx);
loop {
try {
state Optional<int64_t> quota = wait(getStorageQuota(&tr, tenantGroupName));
return quota;
} catch (Error& e) {
wait(tr.onError(e));
@ -141,13 +173,13 @@ struct StorageQuotaWorkload : TestWorkload {
}
}
ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self, Database cx, bool expectOk) {
ACTOR static Future<bool> tryWrite(StorageQuotaWorkload* self, Database cx, TenantName tenant, bool expectOk) {
state int i;
// Retry the transaction a few times if needed; this allows us to wait for a while for all
// the storage usage and quota related monitors to fetch and propagate the latest information
// about the tenants that are over storage quota.
for (i = 0; i < 10; i++) {
state Transaction tr(cx, self->tenant);
state Transaction tr(cx, tenant);
loop {
try {
Standalone<KeyValueRef> kv =

View File

@ -26,8 +26,8 @@
// This workload sets the throughput quota of a tag during the setup phase
class ThroughputQuotaWorkload : public TestWorkload {
TransactionTag transactionTag;
double reservedQuota{ 0.0 };
double totalQuota{ 0.0 };
int64_t reservedQuota{ 0 };
int64_t totalQuota{ 0 };
ACTOR static Future<Void> setup(ThroughputQuotaWorkload* self, Database cx) {
state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);

View File

@ -118,14 +118,14 @@ Arena::Arena(Arena&& r) noexcept = default;
Arena& Arena::operator=(const Arena& r) = default;
Arena& Arena::operator=(Arena&& r) noexcept = default;
void Arena::dependsOn(const Arena& p) {
if (p.impl) {
// x.dependsOn(y) is a no-op if they refer to the same ArenaBlocks.
// They will already have the same lifetime.
if (p.impl && p.impl.getPtr() != impl.getPtr()) {
allowAccess(impl.getPtr());
allowAccess(p.impl.getPtr());
ArenaBlock::dependOn(impl, p.impl.getPtr());
disallowAccess(p.impl.getPtr());
if (p.impl.getPtr() != impl.getPtr()) {
disallowAccess(impl.getPtr());
}
disallowAccess(impl.getPtr());
}
}
@ -297,6 +297,7 @@ void* ArenaBlock::make4kAlignedBuffer(uint32_t size) {
}
void ArenaBlock::dependOn(Reference<ArenaBlock>& self, ArenaBlock* other) {
ASSERT(self->getData() != other->getData());
other->addref();
if (!self || self->isTiny() || self->unused() < sizeof(ArenaBlockRef))
create(SMALL, self)->makeReference(other);
@ -775,6 +776,16 @@ TEST_CASE("/flow/Arena/Size") {
return Void();
}
// Test that x.dependsOn(x) works, and is effectively a no-op.
TEST_CASE("/flow/Arena/SelfRef") {
Arena a(4096);
// This should be a no-op.
a.dependsOn(a);
return Void();
}
TEST_CASE("flow/StringRef/eat") {
StringRef str = "test/case"_sr;
StringRef first = str.eat("/");
@ -815,4 +826,4 @@ TEST_CASE("flow/StringRef/eat") {
ASSERT(str == ""_sr);
return Void();
}
}
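
The SelfRef test above only checks that a.dependsOn(a) returns without error. Below is a brief sketch of the lifetime guarantee dependsOn() provides; the function and variable names are illustrative, not part of the test suite.

#include "flow/Arena.h"
#include "flow/Error.h"

void dependsOnLifetimeSketch() {
    Arena a;
    StringRef s;
    {
        Arena b;
        s = StringRef(b, "hello"_sr); // copy owned by b's blocks
        a.dependsOn(b); // a now keeps b's blocks alive
        a.dependsOn(a); // self-reference: effectively a no-op after this change
    }
    // b is out of scope, but s is still valid because a holds a reference
    // to the blocks backing it.
    ASSERT(s == "hello"_sr);
}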

View File

@ -29,21 +29,25 @@
#define PRIORITYMULTILOCK_ACTOR_H
#include "flow/flow.h"
#include <boost/intrusive/list.hpp>
#include "flow/actorcompiler.h" // This must be the last #include.
#define PRIORITYMULTILOCK_DEBUG 0
#if PRIORITYMULTILOCK_DEBUG || !defined(NO_INTELLISENSE)
#define pml_debug_printf(...) \
if (now() > 0) \
printf(__VA_ARGS__)
if (now() > 0) { \
printf("pml line=%04d ", __LINE__); \
printf(__VA_ARGS__); \
}
#else
#define pml_debug_printf(...)
#endif
// A multi-user lock with a concurrent holder limit where waiters request a lock with a priority
// id and are granted locks based on a total concurrency and relative weights of the current active
// priorities. Priority ids must start at 0 and are sequential integers.
// priorities. Priority ids must start at 0 and are sequential integers. Priority id numbers
// are not related to the importance of the priority in execution.
//
// Scheduling logic
// Let
@ -64,17 +68,17 @@
// The interface is similar to FlowMutex except that lock holders can just drop the lock to release it.
//
// Usage:
// Lock lock = wait(prioritylock.lock(priorityLevel));
// Lock lock = wait(prioritylock.lock(priority_id));
// lock.release(); // Explicit release, or
// // let lock and all copies of lock go out of scope to release
class PriorityMultiLock {
class PriorityMultiLock : public ReferenceCounted<PriorityMultiLock> {
public:
// Waiting on the lock returns a Lock, which is really just a Promise<Void>
// Calling release() is not necessary, it exists in case the Lock holder wants to explicitly release
// the Lock before it goes out of scope.
struct Lock {
void release() { promise.send(Void()); }
bool isLocked() const { return promise.canBeSet(); }
// This is exposed in case the caller wants to use/copy it directly
Promise<Void> promise;
@ -84,10 +88,11 @@ public:
: PriorityMultiLock(concurrency, parseStringToVector<int>(weights, ',')) {}
PriorityMultiLock(int concurrency, std::vector<int> weightsByPriority)
: concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0), releaseDebugID(0) {
: concurrency(concurrency), available(concurrency), waiting(0), totalPendingWeights(0) {
priorities.resize(weightsByPriority.size());
for (int i = 0; i < priorities.size(); ++i) {
priorities[i].priority = i;
priorities[i].weight = weightsByPriority[i];
}
@ -102,7 +107,8 @@ public:
// If this priority currently has no waiters
if (q.empty()) {
// Add this priority's weight to the total for priorities with pending work
// Add this priority's weight to the total for priorities with pending work. This must be done
// so that currentCapacity() below will assign capacity to this priority.
totalPendingWeights += p.weight;
// If there are slots available and the priority has capacity then don't make the caller wait
@ -114,80 +120,71 @@ public:
Lock lock;
addRunner(lock, &p);
pml_debug_printf("lock nowait line %d priority %d %s\n", __LINE__, priority, toString().c_str());
pml_debug_printf("lock nowait priority %d %s\n", priority, toString().c_str());
return lock;
}
// If we didn't return above then add the priority to the waitingPriorities list
waitingPriorities.push_back(p);
}
Waiter w;
q.push_back(w);
Waiter& w = q.emplace_back();
++waiting;
pml_debug_printf("lock wait line %d priority %d %s\n", __LINE__, priority, toString().c_str());
pml_debug_printf("lock wait priority %d %s\n", priority, toString().c_str());
return w.lockPromise.getFuture();
}
void kill() {
pml_debug_printf("kill %s\n", toString().c_str());
brokenOnDestruct.reset();
// handleRelease will not free up any execution slots when it ends via cancel
fRunner.cancel();
available = 0;
runners.clear();
priorities.clear();
waitingPriorities.clear();
for (auto& p : priorities) {
p.queue.clear();
}
}
std::string toString() const {
int runnersDone = 0;
for (int i = 0; i < runners.size(); ++i) {
if (runners[i].isReady()) {
++runnersDone;
}
}
std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d runnersQueue=%d "
"runnersDone=%d pendingWeights=%d ",
std::string s = format("{ ptr=%p concurrency=%d available=%d running=%d waiting=%d "
"pendingWeights=%d ",
this,
concurrency,
available,
concurrency - available,
waiting,
runners.size(),
runnersDone,
totalPendingWeights);
for (int i = 0; i < priorities.size(); ++i) {
s += format("p%d:{%s} ", i, priorities[i].toString(this).c_str());
for (auto& p : priorities) {
s += format("{%s} ", p.toString(this).c_str());
}
s += "}";
if (concurrency - available != runners.size() - runnersDone) {
pml_debug_printf("%s\n", s.c_str());
ASSERT_EQ(concurrency - available, runners.size() - runnersDone);
}
return s;
}
int maxPriority() const { return priorities.size() - 1; }
int totalWaiters() const { return waiting; }
int getRunnersCount() const { return concurrency - available; }
int getWaitersCount() const { return waiting; }
int numWaiters(const unsigned int priority) const {
int getWaitersCount(const unsigned int priority) const {
ASSERT(priority < priorities.size());
return priorities[priority].queue.size();
}
int totalRunners() const { return concurrency - available; }
int numRunners(const unsigned int priority) const {
int getRunnersCount(const unsigned int priority) const {
ASSERT(priority < priorities.size());
return priorities[priority].runners;
}
private:
struct Waiter {
Waiter() {}
Promise<Lock> lockPromise;
};
@ -202,8 +199,8 @@ private:
typedef Deque<Waiter> Queue;
struct Priority {
Priority() : runners(0), weight(0) {}
struct Priority : boost::intrusive::list_base_hook<> {
Priority() : runners(0), weight(0), priority(-1) {}
// Queue of waiters at this priority
Queue queue;
@ -211,9 +208,12 @@ private:
int runners;
// Configured weight for this priority
int weight;
// Priority number for convenience, matches *this's index in PML priorities vector
int priority;
std::string toString(const PriorityMultiLock* pml) const {
return format("weight=%d run=%d wait=%d cap=%d",
return format("priority=%d weight=%d run=%d wait=%d cap=%d",
priority,
weight,
runners,
queue.size(),
@ -222,51 +222,41 @@ private:
};
std::vector<Priority> priorities;
typedef boost::intrusive::list<Priority, boost::intrusive::constant_time_size<false>> WaitingPrioritiesList;
// Current or recent (ended) runners
Deque<Future<Void>> runners;
// List of all priorities with 1 or more waiters. This list exists so that the scheduling loop
// does not have to iterate over the priorities vector checking priorities without waiters.
WaitingPrioritiesList waitingPriorities;
Future<Void> fRunner;
AsyncTrigger wakeRunner;
Promise<Void> brokenOnDestruct;
// Used for debugging, can roll over without issue
unsigned int releaseDebugID;
ACTOR static Future<Void> handleRelease(PriorityMultiLock* self, Future<Void> f, Priority* priority) {
state [[maybe_unused]] unsigned int id = self->releaseDebugID++;
pml_debug_printf("%f handleRelease self=%p id=%u start \n", now(), self, id);
ACTOR static void handleRelease(Reference<PriorityMultiLock> self, Priority* priority, Future<Void> holder) {
pml_debug_printf("%f handleRelease self=%p start\n", now(), self.getPtr());
try {
wait(f);
pml_debug_printf("%f handleRelease self=%p id=%u success\n", now(), self, id);
wait(holder);
pml_debug_printf("%f handleRelease self=%p success\n", now(), self.getPtr());
} catch (Error& e) {
pml_debug_printf("%f handleRelease self=%p id=%u error %s\n", now(), self, id, e.what());
if (e.code() == error_code_actor_cancelled) {
throw;
}
pml_debug_printf("%f handleRelease self=%p error %s\n", now(), self.getPtr(), e.what());
}
pml_debug_printf("lock release line %d priority %d %s\n",
__LINE__,
(int)(priority - &self->priorities.front()),
self->toString().c_str());
pml_debug_printf("lock release priority %d %s\n", (int)(priority->priority), self->toString().c_str());
pml_debug_printf("%f handleRelease self=%p id=%u releasing\n", now(), self, id);
pml_debug_printf("%f handleRelease self=%p releasing\n", now(), self.getPtr());
++self->available;
priority->runners -= 1;
// If there are any waiters or if the runners array is getting large, trigger the runner loop
if (self->waiting > 0 || self->runners.size() > 1000) {
if (self->waiting > 0) {
self->wakeRunner.trigger();
}
return Void();
}
void addRunner(Lock& lock, Priority* p) {
p->runners += 1;
void addRunner(Lock& lock, Priority* priority) {
priority->runners += 1;
--available;
runners.push_back(handleRelease(this, lock.promise.getFuture(), p));
handleRelease(Reference<PriorityMultiLock>::addRef(this), priority, lock.promise.getFuture());
}
// Current maximum running tasks for the specified priority, which must have waiters
@ -278,76 +268,50 @@ private:
}
ACTOR static Future<Void> runner(PriorityMultiLock* self) {
state int sinceYield = 0;
state Future<Void> error = self->brokenOnDestruct.getFuture();
// Priority to try to run tasks from next
state int priority = 0;
state WaitingPrioritiesList::iterator p = self->waitingPriorities.end();
loop {
pml_debug_printf(
"runner loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
// Cleanup finished runner futures at the front of the runner queue.
while (!self->runners.empty() && self->runners.front().isReady()) {
self->runners.pop_front();
}
pml_debug_printf("runner loop start priority=%d %s\n", p->priority, self->toString().c_str());
// Wait for a runner to release its lock
pml_debug_printf(
"runner loop waitTrigger line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
pml_debug_printf("runner loop waitTrigger priority=%d %s\n", p->priority, self->toString().c_str());
wait(self->wakeRunner.onTrigger());
pml_debug_printf(
"%f runner loop wake line %d priority=%d %s\n", now(), __LINE__, priority, self->toString().c_str());
if (++sinceYield == 100) {
sinceYield = 0;
pml_debug_printf(
" runner waitDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
wait(delay(0));
pml_debug_printf(
" runner afterDelay line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
}
pml_debug_printf("%f runner loop wake priority=%d %s\n", now(), p->priority, self->toString().c_str());
// While there are available slots and there are waiters, launch tasks
while (self->available > 0 && self->waiting > 0) {
pml_debug_printf(
" launch loop start line %d priority=%d %s\n", __LINE__, priority, self->toString().c_str());
Priority* pPriority;
pml_debug_printf(" launch loop start priority=%d %s\n", p->priority, self->toString().c_str());
// Find the next priority with waiters and capacity. There must be at least one.
loop {
// Rotate to next priority
if (++priority == self->priorities.size()) {
priority = 0;
if (p == self->waitingPriorities.end()) {
p = self->waitingPriorities.begin();
}
pPriority = &self->priorities[priority];
pml_debug_printf(" launch loop scan priority=%d %s\n", p->priority, self->toString().c_str());
pml_debug_printf(" launch loop scan line %d priority=%d %s\n",
__LINE__,
priority,
self->toString().c_str());
if (!pPriority->queue.empty() && pPriority->runners < self->currentCapacity(pPriority->weight)) {
if (!p->queue.empty() && p->runners < self->currentCapacity(p->weight)) {
break;
}
++p;
}
Queue& queue = pPriority->queue;
Queue& queue = p->queue;
Waiter w = queue.front();
queue.pop_front();
// If this priority is now empty, subtract its weight from the total pending weights
// If this priority is now empty, subtract its weight from the total pending weights and remove it
// from the waitingPriorities list
Priority* pPriority = &*p;
if (queue.empty()) {
p = self->waitingPriorities.erase(p);
self->totalPendingWeights -= pPriority->weight;
pml_debug_printf(" emptied priority line %d priority=%d %s\n",
__LINE__,
priority,
self->toString().c_str());
pml_debug_printf(
" emptied priority priority=%d %s\n", pPriority->priority, self->toString().c_str());
}
--self->waiting;
@ -365,10 +329,9 @@ private:
self->addRunner(lock, pPriority);
}
pml_debug_printf(" launched line %d alreadyDone=%d priority=%d %s\n",
__LINE__,
pml_debug_printf(" launched alreadyDone=%d priority=%d %s\n",
!lock.promise.canBeSet(),
priority,
pPriority->priority,
self->toString().c_str());
}
}
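
Since PriorityMultiLock is now ReferenceCounted and handleRelease() retains a Reference, callers are expected to hold the lock through a Reference as well. Below is a minimal usage sketch under that assumption; the weights, priority id, and work body are illustrative.

#include "flow/PriorityMultiLock.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.

ACTOR Future<Void> doPrioritizedWork(Reference<PriorityMultiLock> pml) {
    // Priority ids index the weights vector; they do not imply importance.
    state PriorityMultiLock::Lock lock = wait(pml->lock(/*priority_id=*/1));
    wait(delay(0.1)); // hold one concurrency slot while working
    lock.release(); // optional; dropping all copies of lock also releases it
    return Void();
}

Future<Void> exampleSketch() {
    // 10 concurrent holders, two priorities weighted 1:3.
    auto pml = makeReference<PriorityMultiLock>(10, std::vector<int>{ 1, 3 });
    return doPrioritizedWork(pml);
}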

View File

@ -0,0 +1,180 @@
/*
* BenchBlobDeltaFiles.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "benchmark/benchmark.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h"
#include "flow/IRandom.h"
#include "flow/DeterministicRandom.h"
#include "fdbclient/BlobGranuleFiles.h"
#include "flow/flow.h"
#include <cstdlib>
#include <stdexcept>
// Pre-generated GranuleDelta size in bytes for benchmark.
const static int PRE_GEN_TARGET_BYTES[] = { 128 * 1024, 512 * 1024, 1024 * 1024 };
// Generate GranuleDeltas in a deterministic way. Change the seed to test a new data set.
class DeltaGenerator {
public:
DeltaGenerator(uint32_t seed = 12345678) {
randGen = Reference<IRandom>(new DeterministicRandom(seed));
// Generate key range
prefix = StringRef(ar, randGen->randomUniqueID().toString() + "_");
range = KeyRangeRef(prefix, StringRef(ar, strinc(prefix)));
// Generate version jump size
minVersionJump = randGen->randomExp(0, 25);
maxVersionJump = minVersionJump + randGen->randomExp(0, 25);
// Generate value size range
maxValueSize = randGen->randomExp(7, 9);
// Generate start version
version = randGen->randomUInt32();
// Generate probability of updating existing keys
updateExistingKeysProb = randGen->random01();
// Generate deltas
for (auto i : PRE_GEN_TARGET_BYTES) {
genDeltas(i);
}
fmt::print("key range: {} - {}\n", range.begin.printable(), range.end.printable());
fmt::print("start version: {}\n", version);
fmt::print("max value bytes: {}\n", maxValueSize);
fmt::print("version jump range: {} - {}\n", minVersionJump, maxVersionJump);
fmt::print("probability for update: {}\n", updateExistingKeysProb);
fmt::print("unseed: {}\n", randGen->randomUInt32());
}
KeyRange getRange() { return range; }
Standalone<GranuleDeltas> getDelta(int targetBytes) {
if (deltas.find(targetBytes) != deltas.end()) {
return deltas[targetBytes];
}
throw std::invalid_argument("Test delta file size is not pre-generated!");
}
private:
void genDeltas(int targetBytes) {
Standalone<GranuleDeltas> data;
int totalDataBytes = 0;
while (totalDataBytes < targetBytes) {
data.push_back(ar, newDelta());
totalDataBytes += data.back().expectedSize();
}
deltas[targetBytes] = data;
}
MutationRef newMutation() { return MutationRef(ar, MutationRef::SetValue, key(), value()); }
MutationsAndVersionRef newDelta() {
version += randGen->randomInt(minVersionJump, maxVersionJump);
MutationsAndVersionRef ret(version, version);
for (int i = 0; i < 10; i++) {
ret.mutations.push_back_deep(ar, newMutation());
}
return ret;
}
StringRef key() {
// Pick an existing key
if (randGen->random01() < updateExistingKeysProb && !usedKeys.empty()) {
int r = randGen->randomUInt32() % usedKeys.size();
auto it = usedKeys.begin();
for (; r != 0; r--)
it++;
return StringRef(ar, *it);
}
// Create a new key
std::string key = prefix.toString() + randGen->randomUniqueID().toString();
usedKeys.insert(key);
return StringRef(ar, key);
}
StringRef value() {
int valueSize = randGen->randomInt(maxValueSize / 2, maxValueSize * 3 / 2);
std::string value = randGen->randomUniqueID().toString();
if (value.size() > valueSize) {
value = value.substr(0, valueSize);
}
if (value.size() < valueSize) {
// repeated string so it's compressible
value += std::string(valueSize - value.size(), 'x');
}
return StringRef(ar, value);
}
Reference<IRandom> randGen;
Arena ar;
KeyRangeRef range;
Key prefix;
int maxValueSize;
Version version;
int minVersionJump;
int maxVersionJump;
std::set<std::string> usedKeys;
double updateExistingKeysProb;
std::map<int, Standalone<GranuleDeltas>> deltas;
};
static DeltaGenerator deltaGen; // Pre-generate deltas
// Benchmark serialization without compression/encryption. The main CPU cost should be sortDeltasByKey
static void bench_serialize_deltas(benchmark::State& state) {
int targetBytes = state.range(0);
int chunkSize = state.range(1);
Standalone<GranuleDeltas> delta = deltaGen.getDelta(targetBytes);
KeyRange range = deltaGen.getRange();
Standalone<StringRef> fileName = "testdelta"_sr; // unused
Optional<CompressionFilter> compressFilter; // unused. no compression
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx; // unused. no encryption
uint32_t serializedBytes = 0;
for (auto _ : state) {
Value serialized = serializeChunkedDeltaFile(fileName, delta, range, chunkSize, compressFilter, cipherKeysCtx);
serializedBytes += serialized.size();
}
state.SetBytesProcessed(static_cast<long>(state.iterations()) * targetBytes);
state.counters["serialized_bytes"] = serializedBytes;
}
// Benchmark sorting deltas
static void bench_sort_deltas(benchmark::State& state) {
int targetBytes = state.range(0);
Standalone<GranuleDeltas> delta = deltaGen.getDelta(targetBytes);
KeyRange range = deltaGen.getRange();
for (auto _ : state) {
sortDeltasByKey(delta, range);
}
state.SetBytesProcessed(static_cast<long>(state.iterations()) * targetBytes);
}
// Benchmark serialization for granule deltas 128KB, 512KB and 1024KB. Chunk size 32KB
BENCHMARK(bench_serialize_deltas)
->Args({ 128 * 1024, 32 * 1024 })
->Args({ 512 * 1024, 32 * 1024 })
->Args({ 1024 * 1024, 32 * 1024 });
// Benchmark sorting for granule deltas 128KB, 512KB and 1024KB. Chunk size 32KB
BENCHMARK(bench_sort_deltas)->Args({ 128 * 1024 })->Args({ 512 * 1024 })->Args({ 1024 * 1024 });

View File

@ -25,26 +25,28 @@
#include "flow/PriorityMultiLock.actor.h"
#include <deque>
#include "flow/actorcompiler.h" // This must be the last #include.
#include "fmt/printf.h"
ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
state std::vector<int> priorities;
// Arg1 is the number of active priorities to use
// Arg2 is the number of inactive priorities to use
state int active = benchState->range(0);
state int inactive = benchState->range(1);
// Set up priority list with limits 10, 20, 30, ...
while (priorities.size() < benchState->range(0)) {
state std::vector<int> priorities;
while (priorities.size() < active + inactive) {
priorities.push_back(10 * (priorities.size() + 1));
}
state int concurrency = priorities.size() * 10;
state PriorityMultiLock* pml = new PriorityMultiLock(concurrency, priorities);
state std::vector<int> counts;
counts.resize(priorities.size(), 0);
state Reference<PriorityMultiLock> pml = makeReference<PriorityMultiLock>(concurrency, priorities);
// Clog the lock by taking concurrency locks
// Clog the lock by taking n=concurrency locks
state std::deque<Future<PriorityMultiLock::Lock>> lockFutures;
for (int j = 0; j < concurrency; ++j) {
lockFutures.push_back(pml->lock(j % priorities.size()));
lockFutures.push_back(pml->lock(j % active));
}
// Wait for all of the initial locks to be taken
// This will work regardless of their priorities as there are only n = concurrency of them
wait(waitForAll(std::vector<Future<PriorityMultiLock::Lock>>(lockFutures.begin(), lockFutures.end())));
@ -64,7 +66,7 @@ ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
PriorityMultiLock::Lock lock = wait(f);
// Rotate to another priority
if (++p == priorities.size()) {
if (++p == active) {
p = 0;
}
@ -76,7 +78,6 @@ ACTOR static Future<Void> benchPriorityMultiLock(benchmark::State* benchState) {
benchState->SetItemsProcessed(static_cast<long>(benchState->iterations()));
delete pml;
return Void();
}
@ -84,4 +85,4 @@ static void bench_priorityMultiLock(benchmark::State& benchState) {
onMainThread([&benchState]() { return benchPriorityMultiLock(&benchState); }).blockUntilReady();
}
BENCHMARK(bench_priorityMultiLock)->DenseRange(1, 8)->ReportAggregatesOnly(true);
BENCHMARK(bench_priorityMultiLock)->Args({ 5, 0 })->Ranges({ { 1, 64 }, { 0, 128 } })->ReportAggregatesOnly(true);

View File

@ -240,7 +240,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml)
add_fdb_test(TEST_FILES rare/RedwoodDeltaTree.toml)
add_fdb_test(TEST_FILES rare/Throttling.toml)
add_fdb_test(TEST_FILES rare/ThroughputQuota.toml IGNORE)
add_fdb_test(TEST_FILES rare/ThroughputQuota.toml)
add_fdb_test(TEST_FILES rare/TransactionCost.toml)
add_fdb_test(TEST_FILES rare/TransactionTagApiCorrectness.toml)
add_fdb_test(TEST_FILES rare/TransactionTagSwizzledApiCorrectness.toml)

View File

@ -6,35 +6,60 @@ import subprocess
import os
import socket
import time
import fcntl
import sys
import tempfile
CLUSTER_UPDATE_TIMEOUT_SEC = 10
EXCLUDE_SERVERS_TIMEOUT_SEC = 120
RETRY_INTERVAL_SEC = 0.5
PORT_LOCK_DIR = Path(tempfile.gettempdir()).joinpath("fdb_local_cluster_port_locks")
MAX_PORT_ACQUIRE_ATTEMPTS = 1000
def _get_free_port_internal():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("0.0.0.0", 0))
return s.getsockname()[1]
class PortProvider:
def __init__(self):
self._used_ports = set()
self._lock_files = []
PORT_LOCK_DIR.mkdir(exist_ok=True)
def get_free_port(self):
counter = 0
while True:
counter += 1
if counter > MAX_PORT_ACQUIRE_ATTEMPTS:
assert False, "Failed to acquire a free port after {} attempts".format(MAX_PORT_ACQUIRE_ATTEMPTS)
port = PortProvider._get_free_port_internal()
if port in self._used_ports:
continue
lock_path = PORT_LOCK_DIR.joinpath("{}.lock".format(port))
try:
locked_fd = open(lock_path, "w+")
self._lock_files.append(locked_fd)
fcntl.lockf(locked_fd, fcntl.LOCK_EX)
self._used_ports.add(port)
return port
except OSError:
print("Failed to lock file {}. Trying to aquire another port".format(lock_path), file=sys.stderr)
pass
_used_ports = set()
def is_port_in_use(port):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(("localhost", port)) == 0
def _get_free_port_internal():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("0.0.0.0", 0))
return s.getsockname()[1]
def get_free_port():
global _used_ports
port = _get_free_port_internal()
while port in _used_ports:
port = _get_free_port_internal()
_used_ports.add(port)
return port
def is_port_in_use(port):
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(("localhost", port)) == 0
def release_locks(self):
for fd in self._lock_files:
fd.close()
try:
os.remove(fd.name)
except:
pass
self._lock_files.clear()
valid_letters_for_secret = string.ascii_letters + string.digits
@ -122,6 +147,7 @@ logdir = {logdir}
custom_config: dict = {},
public_key_json_str: str = "",
):
self.port_provider = PortProvider()
self.basedir = Path(basedir)
self.etc = self.basedir.joinpath("etc")
self.log = self.basedir.joinpath("log")
@ -188,7 +214,7 @@ logdir = {logdir}
def __next_port(self):
if self.first_port is None:
return get_free_port()
return self.port_provider.get_free_port()
else:
self.last_used_port += 1
return self.last_used_port
@ -284,7 +310,7 @@ logdir = {logdir}
in_use = False
for server_id in self.active_servers:
port = self.server_ports[server_id]
if is_port_in_use(port):
if PortProvider.is_port_in_use(port):
print("Port {} in use. Waiting for it to be released".format(port))
in_use = True
break
@ -300,6 +326,10 @@ logdir = {logdir}
def __exit__(self, xc_type, exc_value, traceback):
self.stop_cluster()
self.release_ports()
def release_ports(self):
self.port_provider.release_locks()
def __fdbcli_exec(self, cmd, stdout, stderr, timeout):
args = [self.fdbcli_binary, "-C", self.cluster_file, "--exec", cmd]
@ -334,9 +364,6 @@ logdir = {logdir}
db_config += " blob_granules_enabled:=1"
self.fdbcli_exec(db_config)
if self.blob_granules_enabled:
self.fdbcli_exec("blobrange start \\x00 \\xff")
# Generate and install test certificate chains and keys
def create_tls_cert(self):
assert self.tls_config is not None, "TLS not enabled"

View File

@ -168,6 +168,7 @@ class UpgradeTest:
def __exit__(self, xc_type, exc_value, traceback):
self.cluster.stop_cluster()
self.cluster.release_ports()
if CLEANUP_ON_EXIT:
shutil.rmtree(self.tmp_dir)

View File

@ -6,6 +6,7 @@ enable_encryption = true
enable_tlog_encryption = true
enable_storage_server_encryption = false
enable_blob_granule_encryption = true
max_write_transaction_life_versions = 5000000
[[test]]
testTitle = 'EncryptedBackupAndRestore'

View File

@ -8,20 +8,36 @@ testTitle = 'TenantCreation'
[[test.workload]]
testName = 'CreateTenant'
name = 'First'
group = 'GroupA'
[[test.workload]]
testName = 'CreateTenant'
name = 'Second'
group = 'GroupA'
[[test.workload]]
testName = 'CreateTenant'
name = 'Third'
group = 'GroupB'
[[test.workload]]
testName = 'CreateTenant'
name = 'Fourth'
group = 'GroupB'
[[test]]
testTitle = 'StorageQuota'
[[test.workload]]
testName = 'StorageQuota'
group = 'GroupA'
tenant = 'First'
nodeCount = 250000
emptyTenant = 'Second'
[[test.workload]]
testName = 'StorageQuota'
tenant = 'Second'
group = 'GroupB'
tenant = 'Third'
nodeCount = 25000
emptyTenant = 'Fourth'

View File

@ -4,15 +4,10 @@ testTitle='ThroughputQuotaTest'
[[test.workload]]
testName='ThroughputQuota'
transactionTag='a'
totalQuota=1.0
[[test.workload]]
testName='Status'
enableLatencyBands = true
testDuration = 60.0
totalQuota=16384
[[test.workload]]
testName = 'Cycle'
transactionsPerSecond = 2500.0
testDuration = 60.0
transactionsPerSecond = 250.0
testDuration = 30.0
expectedRate = 0